In [1]:
## Importing relevant libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score

In [2]:
## Mount Google Drive inside the Colab runtime
# The data files read later in this notebook live under /content/drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## Loading the dataset and embeddings

embeddings = np.load(r"/content/drive/MyDrive/Colab Notebooks/_Haider/data/embeddings.npy")
df = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/_Haider/data/articles_gender.csv")
print(f"Shape of embeddings: {embeddings.shape}")
print(f"Shape of df: {df.shape}")

# The label columns of `df` are later paired with the rows of `embeddings`
# purely by position, so the two files must contain the same number of rows.
# Fail here, next to the load, rather than deep inside a later cell.
assert embeddings.shape[0] == len(df), (
    f"Row count mismatch: {embeddings.shape[0]} embeddings vs {len(df)} rows in df"
)

Shape of embeddings: (142426, 300)
Shape of df: (142426, 8)


In [4]:
## Declaring targets
# `.to_numpy()` always returns a NumPy ndarray, whereas `.values` may hand back
# a pandas ExtensionArray depending on the column's dtype.

y_sentiment = df['sentiment'].to_numpy()
y_gender = df['gender'].to_numpy()

In [5]:
## Train / test split
# A single call splits the embeddings and both label arrays together, so each
# row keeps its sentiment and gender label. 20% of the rows are held out and
# the shuffle is seeded with 42.

(
    X_train, X_test,
    y_sentiment_train, y_sentiment_test,
    y_gender_train, y_gender_test,
) = train_test_split(
    embeddings,
    y_sentiment,
    y_gender,
    test_size=0.2,
    random_state=42,
)

# Report the size of every piece produced by the split.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_sentiment_train shape:", y_sentiment_train.shape)
print("y_sentiment_test shape:", y_sentiment_test.shape)
print("y_gender_train shape:", y_gender_train.shape)
print("y_gender_test shape:", y_gender_test.shape)

X_train shape: (113940, 300)
X_test shape: (28486, 300)
y_sentiment_train shape: (113940,)
y_sentiment_test shape: (28486,)
y_gender_train shape: (113940,)
y_gender_test shape: (28486,)


In [6]:
## Empty table that collects one row of metrics per model
# Columns: model name, then accuracy and F1 for each of the two targets.

results_df = pd.DataFrame(
    columns=[
        'Model',
        'Accuracy (Sentiment)',
        'F1 Score (Sentiment)',
        'Accuracy (Gender)',
        'F1 Score (Gender)',
    ]
)

In [7]:
## Standardise the features
# The scaler's statistics are learned from the training split only and then
# applied to both splits.

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
## k-nearest neighbours (k = 7): one classifier per target

# Fit both classifiers on the same training features.
model_sentiment = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_sentiment_train)
model_gender = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_gender_train)

# Predict labels for the held-out rows.
y_sentiment_pred = model_sentiment.predict(X_test)
y_gender_pred = model_gender.predict(X_test)

# Score each task with accuracy and weighted-average F1.
accuracy_sentiment, f1_sentiment = (
    accuracy_score(y_sentiment_test, y_sentiment_pred),
    f1_score(y_sentiment_test, y_sentiment_pred, average='weighted'),
)
accuracy_gender, f1_gender = (
    accuracy_score(y_gender_test, y_gender_pred),
    f1_score(y_gender_test, y_gender_pred, average='weighted'),
)

## Append this model's metrics as the next row of the results table

results_df.loc[len(results_df)] = [
    'KNN',
    accuracy_sentiment,
    f1_sentiment,
    accuracy_gender,
    f1_gender,
]
results_df

Unnamed: 0,Model,Accuracy (Sentiment),F1 Score (Sentiment),Accuracy (Gender),F1 Score (Gender)
0,KNN,0.733237,0.725829,0.790248,0.754079


In [None]:
# Gaussian Naive Bayes: one classifier per target

# --- Sentiment ---
model_sentiment = GaussianNB().fit(X_train, y_sentiment_train)
y_sentiment_pred = model_sentiment.predict(X_test)

# --- Gender ---
model_gender = GaussianNB().fit(X_train, y_gender_train)
y_gender_pred = model_gender.predict(X_test)

# Accuracy for both tasks, then weighted-average F1 for both tasks.
accuracy_sentiment = accuracy_score(y_sentiment_test, y_sentiment_pred)
accuracy_gender = accuracy_score(y_gender_test, y_gender_pred)
f1_sentiment = f1_score(y_sentiment_test, y_sentiment_pred, average='weighted')
f1_gender = f1_score(y_gender_test, y_gender_pred, average='weighted')

# Record the four metrics as a new row and display the table so far.
results_df.loc[len(results_df)] = [
    'Gaussian Naive Bayes',
    accuracy_sentiment,
    f1_sentiment,
    accuracy_gender,
    f1_gender,
]
results_df

Unnamed: 0,Model,Accuracy (Sentiment),F1 Score (Sentiment),Accuracy (Gender),F1 Score (Gender)
0,KNN,0.733237,0.725829,0.790248,0.754079
1,Gaussian Naive Bayes,0.631117,0.671523,0.717932,0.721197


In [None]:
# Decision Tree: one classifier per target
#
# random_state is pinned to 42, matching the Random Forest and SVM cells.
# Without it the fitted tree, and therefore the reported metrics, can change
# from one run of the notebook to the next.

# Sentiment analysis using Decision Tree
model_sentiment = DecisionTreeClassifier(random_state=42)
model_sentiment.fit(X_train, y_sentiment_train)
y_sentiment_pred = model_sentiment.predict(X_test)

accuracy_sentiment = accuracy_score(y_sentiment_test, y_sentiment_pred)
f1_sentiment = f1_score(y_sentiment_test, y_sentiment_pred, average='weighted')

# Gender classification using Decision Tree
model_gender = DecisionTreeClassifier(random_state=42)
model_gender.fit(X_train, y_gender_train)
y_gender_pred = model_gender.predict(X_test)

accuracy_gender = accuracy_score(y_gender_test, y_gender_pred)
f1_gender = f1_score(y_gender_test, y_gender_pred, average='weighted')

# Adding the performance in the results dataframe
results_df.loc[len(results_df)] = ['Decision Tree', accuracy_sentiment, f1_sentiment, accuracy_gender, f1_gender]
results_df

Unnamed: 0,Model,Accuracy (Sentiment),F1 Score (Sentiment),Accuracy (Gender),F1 Score (Gender)
0,KNN,0.733237,0.725829,0.790248,0.754079
1,Gaussian Naive Bayes,0.631117,0.671523,0.717932,0.721197
2,Decision Tree,0.609457,0.610576,0.658569,0.664801


In [None]:
# Random Forest (100 trees, seed 42): one classifier per target

# Fit the sentiment model first, then the gender model.
model_sentiment = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_sentiment_train
)
model_gender = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_gender_train
)

# Predictions on the held-out rows.
y_sentiment_pred = model_sentiment.predict(X_test)
y_gender_pred = model_gender.predict(X_test)

# Sentiment metrics.
accuracy_sentiment = accuracy_score(y_sentiment_test, y_sentiment_pred)
f1_sentiment = f1_score(y_sentiment_test, y_sentiment_pred, average='weighted')

# Gender metrics.
accuracy_gender = accuracy_score(y_gender_test, y_gender_pred)
f1_gender = f1_score(y_gender_test, y_gender_pred, average='weighted')

# Store the row and show the table.
results_df.loc[len(results_df)] = [
    'Random Forest',
    accuracy_sentiment,
    f1_sentiment,
    accuracy_gender,
    f1_gender,
]
results_df

Unnamed: 0,Model,Accuracy (Sentiment),F1 Score (Sentiment),Accuracy (Gender),F1 Score (Gender)
0,KNN,0.733237,0.725829,0.790248,0.754079
1,Gaussian Naive Bayes,0.631117,0.671523,0.717932,0.721197
2,Decision Tree,0.609457,0.610576,0.658569,0.664801
3,Random Forest,0.731482,0.713407,0.783016,0.708634


In [9]:
# Linear-kernel SVM (seed 42): one classifier per target
#
# NOTE(review): scikit-learn documents that SVC's fit time grows at least
# quadratically with the number of samples and suggests LinearSVC for large
# datasets. Worth confirming that this cell's runtime is acceptable on the
# full training split.

# Train.
model_sentiment = SVC(kernel='linear', random_state=42).fit(X_train, y_sentiment_train)
model_gender = SVC(kernel='linear', random_state=42).fit(X_train, y_gender_train)

# Predict on the held-out rows.
y_sentiment_pred = model_sentiment.predict(X_test)
y_gender_pred = model_gender.predict(X_test)

# Evaluate: accuracy and weighted-average F1 per task.
accuracy_sentiment, f1_sentiment = (
    accuracy_score(y_sentiment_test, y_sentiment_pred),
    f1_score(y_sentiment_test, y_sentiment_pred, average='weighted'),
)
accuracy_gender, f1_gender = (
    accuracy_score(y_gender_test, y_gender_pred),
    f1_score(y_gender_test, y_gender_pred, average='weighted'),
)

# Append the metrics row and display the table.
results_df.loc[len(results_df)] = [
    'SVM',
    accuracy_sentiment,
    f1_sentiment,
    accuracy_gender,
    f1_gender,
]
results_df

Unnamed: 0,Model,Accuracy (Sentiment),F1 Score (Sentiment),Accuracy (Gender),F1 Score (Gender)
0,KNN,0.733237,0.725829,0.790248,0.754079
1,Gaussian Naive Bayes,0.631117,0.671523,0.717932,0.721197
2,Decision Tree,0.609457,0.610576,0.658569,0.664801
3,Random Forest,0.731482,0.713407,0.783016,0.708634
4,SVM,0.721239,0.700125,0.738593,0.719573


In [10]:
# Write the model comparison table to Drive as CSV; index=False leaves the
# DataFrame's row index out of the file.
results_df.to_csv('/content/drive/MyDrive/Colab Notebooks/_Haider/data/ml_results.csv', index=False)