In [None]:
import numpy as np
import pandas as pd
import nltk

from sklearn.tree imodel import LogisticRegression
from sklearn.svm impomport DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_rt SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Set a fixed random seed
import random
random.seed(42)
np.random.seed(42)

# Read data
df = pd.read_csv('twitter_parsed_dataset.csv')
df = pd.read_csv('twitter_parsed_dataset.csv')
print(df.columns)
# Preprocessing
df = df[df['text'].apply(lambda x: isinstance(x, str))]  # Keep only rows where 'text' is a string
df = df[df['label'].isin([0, 1])]  # Keep only rows where 'label' is 0 or 1
df.drop_duplicates(inplace=True)

# Convert to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
ps = PorterStemmer()
df['text'] = df['text'].apply(lambda x: ' '.join([ps.stem(word) for word in word_tokenize(x)]))

# Oversampling
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(df[['text']], df['label'])

# Convert resampled data back to pandas dataframes
x_resampled = pd.DataFrame(x_resampled, columns=['text'])
y_resampled = pd.Series(y_resampled, name='label')

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x_resampled['text'], y_resampled, test_size=0.15, random_state=42)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Define base models
base_model_1 = DecisionTreeClassifier(random_state=42)
base_model_2 = KNeighborsClassifier()
base_model_3 = GaussianNB()
base_model_4 = LogisticRegression(random_state=42)
base_model_5 = SVC(kernel='linear', random_state=42)

# Train and evaluate each individual base model
for i, base_model in enumerate([base_model_1, base_model_2, base_model_3, base_model_4, base_model_5], 1):
    if isinstance(base_model, GaussianNB):
        base_model.fit(X_train_vect.toarray(), y_train)
        base_model_pred = base_model.predict(X_test_vect.toarray())
    else:
        base_model.fit(X_train_vect, y_train)
        base_model_pred = base_model.predict(X_test_vect)
    print(f'Accuracy of base model {i}: ', accuracy_score(y_test, base_model_pred))

Accuracy of base model 1:  0.9055346276441611

Accuracy of base model 2:  0.8038249782671689

Accuracy of base model 3:  0.744711677774558

Accuracy of base model 4:  0.8753984352361635

Accuracy of base model 5:  0.8930744711677775


In [None]:
import pandas as pd
import numpy as np
import nltk

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Set a fixed random seed
import random
random.seed(42)
np.random.seed(42)

# Read data
df = pd.read_csv('twitter_parsed_dataset.csv')

# Preprocessing
df = df[df['text'].apply(lambda x: isinstance(x, str))]  # Keep only rows where 'text' is a string
df = df[df['label'].isin([0, 1])]  # Keep only rows where 'label' is 0 or 1
df.drop_duplicates(inplace=True)

# Convert to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
ps = PorterStemmer()
df['text'] = df['text'].apply(lambda x: ' '.join([ps.stem(word) for word in word_tokenize(x)]))

# Oversampling
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(df[['text']], df['label'])

# Convert resampled data back to pandas dataframes
x_resampled = pd.DataFrame(x_resampled, columns=['text'])
y_resampled = pd.Series(y_resampled, name='label')

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x_resampled['text'], y_resampled, test_size=0.15, random_state=42)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Define base models
base_models = [
    ("dt_model", DecisionTreeClassifier(random_state=42)),
    ("knn_model", KNeighborsClassifier(n_neighbors=5))
]

# Define StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(), cv=5)

# Train the Stacking model
stacking_model.fit(X_train_vect, y_train)

# Predict the labels for the test data using the Stacking model
stacking_pred = stacking_model.predict(X_test_vect)

# Print the accuracy of the Stacking model
print('Accuracy of Stacking model: ', accuracy_score(y_test, stacking_pred))

# Print Stacking model metrics
print("Stacking Model Precision: %.2f%%" % (precision_score(y_test, stacking_pred) * 100))
print("Stacking Model Recall: %.2f%%" % (recall_score(y_test, stacking_pred) * 100))
print("Stacking Model F1: %.2f%%" % (f1_score(y_test, stacking_pred) * 100))
print("Stacking Model ROC AUC: %.2f%%" % (roc_auc_score(y_test, stacking_pred) * 100))

Accuracy of Stacking model:  0.9203129527673138

Stacking Model Precision: 90.02%

Stacking Model Recall: 94.22%

Stacking Model F1: 92.07%

Stacking Model ROC AUC: 92.07%