In [None]:
import os
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
# change your file path here
data_dir = os.path.join(os.path.curdir, "data")
filepath = os.path.join(data_dir, "train_all_tasks.csv")
# Read the training CSV and discard the 'rewire_id' column in one chained step.
df = pd.read_csv(filepath).drop(columns=['rewire_id'])
df.head()
# The row count is the last expression, so it is what the cell displays.
len(df)

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
# NOTE(review): the data-loading cell earlier in this notebook reads from a
# relative ./data path, not from /content/drive — confirm this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

# **TASK-A**

In [None]:
def clean_text(text):
    """Normalize a raw string before stopword removal and lemmatization.

    Steps, applied in this order:
      1. lowercase the text;
      2. remove tokens starting with ``http`` and tokens starting with ``www``;
      3. remove ``<...>`` tag-like spans (non-greedy);
      4. remove every character that is neither a word character nor whitespace;
      5. collapse each run of whitespace into a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

def remove_stopword(text, stopwords):
    """Drop every whitespace-separated word of ``text`` found in ``stopwords``.

    Args:
        text: The string to filter; it is split on whitespace.
        stopwords: Iterable of words to remove (matched exactly, case-sensitive).

    Returns:
        The remaining words joined by single spaces.
    """
    # Use the argument that was passed in (not a module-level global), and a
    # set so each membership test is constant-time.
    stopword_set = set(stopwords)
    return " ".join(word for word in text.split() if word not in stopword_set)

def lemma_text(text, lemmatizer):
    """Lemmatize each token of ``text`` and re-join the results with spaces.

    Args:
        text: The string to process; it is tokenized with ``tokenize``.
        lemmatizer: Object exposing ``lemmatize(word)``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, tokenize(text)))

def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Returns:
        A list of the whitespace-separated pieces of ``text``; empty for an
        empty or all-whitespace string.
    """
    tokens = text.split()
    return tokens

def process_text(text, lemmatizer, stop_words):
    """Run the full preprocessing pipeline on one string.

    The text is cleaned with ``clean_text``, filtered with ``remove_stopword``
    using ``stop_words``, and finally lemmatized with ``lemma_text`` using
    ``lemmatizer``.

    Returns:
        The processed string.
    """
    cleaned = clean_text(text)
    without_stopwords = remove_stopword(cleaned, stop_words)
    return lemma_text(without_stopwords, lemmatizer)

# Shared preprocessing resources, built once and passed to process_text.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [None]:
# Store the preprocessed version of every 'text' entry in a new column.
df["processed_text"] = df["text"].apply(
    lambda raw_text: process_text(raw_text, lemmatizer, stop_words)
)
df.head(5)

In [None]:
# Under-sampling the data: count the rows carrying each Task A label.
# The counts are looked up by label rather than unpacked positionally, because
# value_counts() orders by frequency: positional unpacking would swap the two
# numbers if "sexist" were the larger class and fail on a third label.
label_counts = df['label_sexist'].value_counts()
class_notsexist = label_counts.get("not sexist", 0)
class_sexist = label_counts.get("sexist", 0)

# Separate the two classes.
class_s = df[df['label_sexist'] == "sexist"]
class_ns = df[df['label_sexist'] == "not sexist"]
print('class_s:', class_s.shape)
print('class_ns:', class_ns.shape)

In [None]:
# Keep every "sexist" row plus up to 1000 more "not sexist" rows than that.
# The request is capped at the rows actually available (sample() raises when
# asked for more without replacement) and seeded so reruns pick the same rows.
n_not_sexist_keep = min(len(class_ns), len(class_s) + 1000)
class_ns_under = class_ns.sample(n=n_not_sexist_keep, random_state=5)

df_under = pd.concat([class_ns_under, class_s], axis=0)
df_under['label_sexist'].value_counts().plot(kind='bar', title='count (target)')

In [None]:
# Turn the processed text of the under-sampled frame into a TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all rows here and the train/test
# split happens in a later cell, so test rows influence the vocabulary and IDF
# weights — consider fitting on the training split only.
tfidf = TfidfVectorizer()
tfidf_text_vec = tfidf.fit_transform(df_under["processed_text"])

In [None]:
# 80/20 shuffled train/test split of the Task A features and labels (seed 5).
split_a = train_test_split(
    tfidf_text_vec,
    df_under['label_sexist'],
    test_size=0.2,
    train_size=0.8,
    random_state=5,
    shuffle=True,
)
x_train_a, x_test_a, y_train_a, y_test_a = split_a

**Logistic Regression**

In [None]:
# Fit an L2-penalized logistic regression on the Task A training split.
lrm = LogisticRegression(penalty='l2', random_state=0).fit(x_train_a, y_train_a)
lrm_pred = lrm.predict(x_test_a)

# scikit-learn metrics take (y_true, y_pred), in that order.
acc = accuracy_score(y_test_a, lrm_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_a, lrm_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_a, lrm_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_a_cls_report_logistic_reg"), index=True)

**Decision Tree**

In [None]:
# Entropy-based decision tree that considers 20 features per split.
# Seeded so reruns give the same tree.
dt_clf = tree.DecisionTreeClassifier(criterion='entropy', max_features=20, random_state=5)
dt_clf.fit(x_train_a, y_train_a)
pred = dt_clf.predict(x_test_a)

# scikit-learn metrics take (y_true, y_pred), in that order.
acc = accuracy_score(y_test_a, pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_a, pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first. The file name no longer carries the
# stray "logistic_" prefix, matching the Task B/C decision-tree reports.
clsf_report = pd.DataFrame(classification_report(y_test_a, pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_a_cls_report_decision_tree"), index=True)

**XGBoost Model**

In [None]:
# XGBoost expects integer class ids 0..k-1 (recent releases raise on string
# labels), so encode the training labels and keep the id -> label lookup.
y_train_a_codes, xgb_classes_a = pd.factorize(y_train_a, sort=True)

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=5)
xgb_model.fit(x_train_a, y_train_a_codes)

# Map the predicted class ids back to the original string labels.
y_pred_codes = np.asarray(xgb_model.predict(x_test_a)).astype(int)
y_pred = np.asarray(xgb_classes_a)[y_pred_codes]

acc = accuracy_score(y_test_a, y_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
# Report on this model's own predictions (y_pred); `pred` holds the decision
# tree's output from an earlier cell.
print(classification_report(y_test_a, y_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_a, y_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_a_cls_report_xgboost"), index=True)

In [None]:
# Random forest on the Task A split. RandomForestClassifier is already imported
# in the notebook's first cell; the forest is seeded so reruns are reproducible.
rf_clf = RandomForestClassifier(random_state=5).fit(x_train_a, y_train_a)
rf_pred = rf_clf.predict(x_test_a)

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_a, rf_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_a, rf_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_a, rf_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_a_cls_report_random_forest"), index=True)

# **TASK-B**

In [None]:
# Task B uses only the rows whose category label is not 'none'.
# NOTE(review): `df` is rebound to the filtered frame, so earlier cells that
# read `df` will see the reduced data if rerun without reloading.
has_category = df['label_category'] != 'none'
df_b = df[has_category]
df = df_b
len(df)

In [None]:
# Re-fit a fresh TF-IDF vectorizer on the processed text of the Task B rows.
# NOTE(review): fitted on all rows before the train/test split in a later
# cell — consider fitting on the training split only.
tfidf = TfidfVectorizer()
tfidf_text_vec = tfidf.fit_transform(df["processed_text"])

In [None]:
# 80/20 shuffled train/test split of the Task B features and labels (seed 5).
split_b = train_test_split(
    tfidf_text_vec,
    df['label_category'],
    test_size=0.2,
    train_size=0.8,
    random_state=5,
    shuffle=True,
)
x_train_b, x_test_b, y_train_b, y_test_b = split_b

**Decision Tree**

In [None]:
# Decision tree on the Task B split, seeded so reruns give the same tree.
dt_clf = tree.DecisionTreeClassifier(random_state=5)
dt_clf.fit(x_train_b, y_train_b)
pred = dt_clf.predict(x_test_b)

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_b, pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_b, pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_b, pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_b_cls_report_decision_tree"), index=True)

**XGBoost Classifier**

In [None]:
# XGBoost expects integer class ids 0..k-1 (recent releases raise on string
# labels), so encode the training labels and keep the id -> label lookup.
y_train_b_codes, xgb_classes_b = pd.factorize(y_train_b, sort=True)

xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=5)
xgb_model.fit(x_train_b, y_train_b_codes)

# Map the predicted class ids back to the original string labels.
y_pred_codes = np.asarray(xgb_model.predict(x_test_b)).astype(int)
y_pred = np.asarray(xgb_classes_b)[y_pred_codes]

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_b, y_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
# Report on this model's own predictions (y_pred); `pred` holds the decision
# tree's output from an earlier cell.
print(classification_report(y_test_b, y_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_b, y_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_b_cls_report_xgboost"), index=True)

**Random Forest Classifier**

In [None]:
# Shallow (max_depth=3) random forest on the Task B split.
# RandomForestClassifier is already imported in the notebook's first cell.
rf_clf = RandomForestClassifier(max_depth=3, random_state=5).fit(x_train_b, y_train_b)
rf_pred = rf_clf.predict(x_test_b)

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_b, rf_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_b, rf_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_b, rf_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_b_cls_report_random_forest"), index=True)

# **TASK-C**

In [None]:
# Task C uses only the rows whose vector label is not 'none'.
# NOTE(review): `df` is rebound to the filtered frame, so earlier cells that
# read `df` will see the reduced data if rerun without reloading.
has_vector = df['label_vector'] != 'none'
df_c = df[has_vector]
df = df_c
len(df)

In [None]:
# Re-fit a fresh TF-IDF vectorizer on the processed text of the Task C rows.
# NOTE(review): fitted on all rows before the train/test split in a later
# cell — consider fitting on the training split only.
tfidf = TfidfVectorizer()
tfidf_text_vec = tfidf.fit_transform(df["processed_text"])

In [None]:
# 80/20 shuffled train/test split of the Task C features and labels (seed 5).
split_c = train_test_split(
    tfidf_text_vec,
    df['label_vector'],
    test_size=0.2,
    train_size=0.8,
    random_state=5,
    shuffle=True,
)
x_train_c, x_test_c, y_train_c, y_test_c = split_c

**Decision Tree**

In [None]:
# Decision tree on the Task C split, seeded so reruns give the same tree.
dt_clf = tree.DecisionTreeClassifier(random_state=2)
dt_clf.fit(x_train_c, y_train_c)
pred = dt_clf.predict(x_test_c)

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_c, pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_c, pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_c, pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_c_cls_report_decision_tree"), index=True)

**XGBoost Classifier**

In [None]:
# XGBoost expects integer class ids 0..k-1 (recent releases raise on string
# labels), so encode the training labels and keep the id -> label lookup.
y_train_c_codes, xgb_classes_c = pd.factorize(y_train_c, sort=True)

xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=2)
xgb_model.fit(x_train_c, y_train_c_codes)

# Map the predicted class ids back to the original string labels.
y_pred_codes = np.asarray(xgb_model.predict(x_test_c)).astype(int)
y_pred = np.asarray(xgb_classes_c)[y_pred_codes]

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_c, y_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
# Report on this model's own predictions (y_pred); `pred` holds the decision
# tree's output from an earlier cell.
print(classification_report(y_test_c, y_pred))

# Save the per-class report. to_csv does not create missing folders, so make
# sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_c, y_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_c_cls_report_xgboost"), index=True)

**Random Forest Classifier**

In [None]:
# Random forest (max_depth=10) on the Task C split.
rf_clf = RandomForestClassifier(max_depth=10, random_state=2).fit(x_train_c, y_train_c)
rf_pred = rf_clf.predict(x_test_c)

# Call the imported accuracy_score directly: no `metrics` module is imported
# in this notebook, so `metrics.accuracy_score` raised NameError.
acc = accuracy_score(y_test_c, rf_pred)
print("Accuracy:", str('{:04.2f}'.format(acc*100))+'%')
print(classification_report(y_test_c, rf_pred))

# Save this model's report (rf_pred); the saved file was previously built from
# `pred`, the decision tree's predictions. to_csv does not create missing
# folders, so make sure the output directory exists first.
clsf_report = pd.DataFrame(classification_report(y_test_c, rf_pred, output_dict=True)).transpose()
result_dir = os.path.join(os.path.curdir, "result")
os.makedirs(result_dir, exist_ok=True)
clsf_report.to_csv(os.path.join(result_dir, "task_c_cls_report_random_forest"), index=True)