# Midterm Assignment - Module 7
## Author: Victor Armenta-Valdes

### Import libraries

In [None]:
import pandas as pd
import os
import zipfile
import matplotlib.pyplot as plt
import spacy
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

### Unzip files

In [None]:
# Walk the Kaggle input directory and unpack every zip archive into the working directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip files that are not zip archives; ZipFile raises BadZipFile on them
        if not zipfile.is_zipfile(archive_path):
            continue
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(".")

### Importing Datasets as Pandas DataFrames

In [None]:
# Load the three CSV files into DataFrames, all read with the same encoding
csv_options = {'encoding': 'ISO-8859-1'}
train = pd.read_csv('./train.csv', **csv_options)
test = pd.read_csv('./test.csv', **csv_options)
test_labels = pd.read_csv('./test_labels.csv', **csv_options)

### Initial Data Exploration

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Raw text of the first training comment
train.comment_text[0]

In [None]:
# Summary statistics for the training data
train.describe()

In [None]:
# (rows, columns) of the training data
train.shape

In [None]:
# Preview the first rows of the test comments
test.head()

In [None]:
# (rows, columns) of the test comments
test.shape

In [None]:
# Preview the first rows of the test labels
test_labels.head()

In [None]:
# Summary statistics for the test labels
test_labels.describe()

In [None]:
# (rows, columns) of the test labels
test_labels.shape

In [None]:
# Combine test datasets on 'id'
test_combi=pd.merge(test,test_labels,on='id')

In [None]:
# Initial view of new test dataset
test_combi.head()

In [None]:
# New shape of test dataset
test_combi.shape

In [None]:
# Dropping test rows whose labels are -1
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Keep a row only when every label differs from -1. The previous OR-chain kept a
# row as soon as a single label was valid, so a row with a -1 in any other
# label column would have survived the filter.
has_all_labels = (test_combi[label_cols] != -1).all(axis=1)
test_combi = test_combi[has_all_labels].reset_index(drop=True)

In [None]:
# Summary statistics of the test data after dropping the -1 rows
test_combi.describe()

In [None]:
# (rows, columns) of the test data after dropping the -1 rows
test_combi.shape

In [None]:
# Per column: does the training dataset contain any null value? (True/False, not a count)
train.isna().any()

In [None]:
# Per column: does the test dataset contain any null value? (True/False, not a count)
test_combi.isna().any()

In [None]:
# Look at record counts for training data: one histogram per column from index 2 onward
fig, axes = plt.subplots(3,2,figsize=(16,16))
for col, h in zip(list(train.columns[2:]),axes.flatten()):
    # Draw the histogram once; a second identical hist() call used to overplot the same bars
    counts, edges, bars = h.hist(train[col], label='training')
    h.bar_label(bars)
    h.set_title(f"{col}"+' - training data')
    h.set(xlabel='category', ylabel = 'count')
    # Log scale so the small bars stay visible next to the large ones
    h.set_yscale('log')

In [None]:
# Look at record counts for test data: one histogram per column from index 2 onward
fig, axes = plt.subplots(3,2,figsize=(16,16))
for col, h in zip(list(test_combi.columns[2:]),axes.flatten()):
    # Draw the histogram once; a second identical hist() call used to overplot the same bars
    counts, edges, bars = h.hist(test_combi[col], label='testing')
    h.bar_label(bars)
    h.set_title(f"{col}"+' - testing data')
    h.set(xlabel='category', ylabel = 'count')
    # Log scale so the small bars stay visible next to the large ones
    h.set_yscale('log')

### Initial data cleaning

In [None]:
# View training comments whose text contains the substring 'http'
train[train.comment_text.str.contains('http')==True]

In [None]:
# View the comment at index 22 before cleaning
train.comment_text[22]

In [None]:
# View comments with special characters
train[train.comment_text.str.contains(r'[\\\^\=\&\%\$\£\@\<\>\[\]\{\}\+\-\:\;]?',regex=True)==True]

In [None]:
# View sample comment
train.comment_text[159566]

In [None]:
# Remove URLs, "\n", special characters and digits from the comment text
def clean_comments(comments):
    """Return the string Series `comments` with URLs, newlines, symbols and digits replaced by spaces."""
    # URLs are removed first, before the symbol pass strips the ':' they contain
    comments = comments.str.replace(r'\s*https?://\S+(\s+|$)', ' ',regex=True).str.strip()
    comments = comments.str.replace('\n', ' ',regex=False).str.strip()
    comments = comments.str.replace(r'[\\\^\=\&\%\$\£\@\<\>\[\]\{\}\+\-\:\;\"\(\)]', ' ',regex=True).str.strip()
    return comments.str.replace(r'[0-9]', ' ',regex=True).str.strip()

train['comment_text'] = clean_comments(train.comment_text)
# Clean the test comments the same way; previously only the training text was
# cleaned, so the two sets were tokenised from differently prepared text
test_combi['comment_text'] = clean_comments(test_combi.comment_text)

In [None]:
# View the comment at index 22 after cleaning
train.comment_text[22]

In [None]:
# View the comment at index 159566 after cleaning
train.comment_text[159566]

In [None]:
# View the first comment after cleaning
train.comment_text[0]

### Using spaCy to count sentences

In [None]:
# Count the sentences of every training comment with spaCy
nlp = spacy.load('en_core_web_sm')
train_sen = train['comment_text'].values.tolist()
# nlp.pipe processes the comments in batches instead of one nlp() call per row
sent = [len(list(doc.sents)) for doc in nlp.pipe(train_sen)]
# A plain list is assigned by position, so the result does not depend on the index labels of `train`
train['sent_count'] = sent

In [None]:
# First rows of the training data, now including the sent_count column
train.head()

In [None]:
# Summary statistics of the per-comment sentence count
train.sent_count.describe()

In [None]:
# Report the mean and total sentence counts (sent_count holds sentences, not words)
print('On average, there are around ' + str(train.sent_count.mean()) + ' sentences per record')
print('In total, there are ' + str(train.sent_count.sum()) + ' sentences in the training dataset')

In [None]:
# Creating a non-toxic column for data analysis
# A comment is non-toxic only when none of the six labels is set. The labels are
# selected by name because the previous positional slice 2:7 stopped before
# identity_hate, so identity-hate-only comments were flagged as non-toxic.
toxicity_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['non_toxic'] = (train[toxicity_labels].sum(axis=1) == 0).astype(int)

In [None]:
# New look at data
train.head()

In [None]:
# Create dataframe of sentence statistics
d = {'toxic': [train.loc[train.toxic==1,'sent_count'].sum(),train.loc[train.toxic==1,'sent_count'].mean()],
     'severe_toxic': [train.loc[train.severe_toxic==1,'sent_count'].sum(),train.loc[train.severe_toxic==1,'sent_count'].mean()],
     'obscene': [train.loc[train.obscene==1,'sent_count'].sum(),train.loc[train.obscene==1,'sent_count'].mean()],
     'threat': [train.loc[train.threat==1,'sent_count'].sum(),train.loc[train.threat==1,'sent_count'].mean()],
     'insult': [train.loc[train.insult==1,'sent_count'].sum(),train.loc[train.insult==1,'sent_count'].mean()],
     'identity_hate': [train.loc[train.identity_hate==1,'sent_count'].sum(),train.loc[train.identity_hate==1,'sent_count'].mean()],
     'non_toxic': [train.loc[train.non_toxic==1,'sent_count'].sum(),train.loc[train.non_toxic==1,'sent_count'].mean()]
    }
train_sent_count = pd.DataFrame(data=d, index=['sum','mean'])
train_sent_count.head()

In [None]:
# Creating histogram of training data - records
cat, count = zip(*train.iloc[:,[2,3,4,5,6,7,9]].sum(axis=0).items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("class distribution (records) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

In [None]:
# Creating histogram of training data - sentences
cat, count = zip(*train_sent_count.iloc[0,:].items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("class distribution (total sentences) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

In [None]:
# Creating histogram of training data - sentences
cat, count = zip(*train_sent_count.iloc[1,:].items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("class distribution (average sentences) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

### Downsampling non-toxic comments

In [None]:
train.head()

In [None]:
# Splitting datasets
all_toxic_comments = train[train.non_toxic != 1]
non_toxic_comments = train[train.non_toxic == 1]

In [None]:
# Average number of flagged records across the six toxicity classes, truncated to an int
cat = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']
avg = int(sum(all_toxic_comments[name].sum() for name in cat) / 6)

In [None]:
# Sampling the average number of rows
# Fixed seed so the down-sampled training set, and every metric computed from it,
# is the same on each run
non_toxic_comments = non_toxic_comments.sample(n=avg, random_state=0)

In [None]:
# Remerging into training dataset
train = pd.concat([all_toxic_comments,non_toxic_comments])
train = train.reset_index(drop=True)

In [None]:
# Creating histogram of training data - records
cat, count = zip(*train.iloc[:,[2,3,4,5,6,7,9]].sum(axis=0).items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("new class distribution (records) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

### Pre-Processing and Tokenisation

In [None]:
# Take the first training comment as a sample string
sample = str(train.comment_text[0])
sample

In [None]:
# Run the sample through the spaCy pipeline to obtain a Doc
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample)

In [None]:
# Print each token of the sample together with its idx attribute
for token in doc:
    print (token, token.idx)

In [None]:
# Setting preprocessing steps

def valid_tokens(token):
    # Decides whether a token is kept for the cleaned text.
    # Returns False when the token is falsy, when its text is blank after
    # stripping, or when it is flagged as a stop word or as punctuation;
    # returns True otherwise.
    has_text = bool(token) and bool(token.text.strip())
    return has_text and not token.is_stop and not token.is_punct

def preprocess(token):
    # Returns the token's lemma with surrounding whitespace removed, in lowercase
    lemma = token.lemma_
    trimmed = lemma.strip()
    return trimmed.lower()
# Keep only the valid tokens of the sample Doc and normalise each of them
samp_token = [preprocess(token) for token in doc if valid_tokens(token)]
samp_token

In [None]:
# Setting preprocessing steps for all training text

# Convert to doc 
nlp = spacy.load('en_core_web_sm')
train_sen = train['comment_text'].values.tolist()

reduced_sent = []
token_count = []
for sen in train_sen:
    doc = nlp(sen)
    tok = [preprocess(token) for token in doc if valid_tokens(token)]
    token_count.append(len(tok))
    tok = ' '.join(tok)
    reduced_sent.append(tok)
train['reduced_sent']=pd.Series(reduced_sent)   
train['token_count']=pd.Series(token_count)

In [None]:
# New look at data
train

In [None]:
# Apply the same token filtering and normalisation to the test data

# Load the spaCy pipeline and collect the comments as a plain list
nlp = spacy.load('en_core_web_sm')
test_sen = test_combi['comment_text'].values.tolist()

# nlp.pipe processes the comments in batches instead of one nlp() call per row
reduced_sent = [
    ' '.join(preprocess(token) for token in doc if valid_tokens(token))
    for doc in nlp.pipe(test_sen)
]
# A plain list is assigned by position, so the result does not depend on the index labels of `test_combi`
test_combi['reduced_sent'] = reduced_sent

In [None]:
# Summary statistics of the per-comment token count
train.token_count.describe()

In [None]:
# Report the mean and the total of the token_count column
print('On average, there are around ' + str(train.token_count.mean()) + ' tokens per record')
print('In total, there are ' + str(train.token_count.sum()) + ' tokens in the training dataset')

In [None]:
# Table of total ('sum') and average ('mean') token counts for each class
stat_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic']
d = {}
for cls in stat_classes:
    # Token counts of the rows where this class flag equals 1
    cls_tokens = train.loc[train[cls]==1,'token_count']
    d[cls] = [cls_tokens.sum(), cls_tokens.mean()]
train_token_count = pd.DataFrame(data=d, index=['sum','mean'])
train_token_count.head()

In [None]:
# Creating histogram of training data - total tokens 
cat, count = zip(*train_token_count.iloc[0,:].items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("class distribution (total tokens) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

In [None]:
# Creating histogram of training data - average tokens
cat, count = zip(*train_token_count.iloc[1,:].items())
plt.figure(figsize=(10,10))
plt.bar(x=cat, height=count)
plt.title("class distribution (average tokens) - training dataset")
plt.ylabel('count')
plt.xlabel('category')

### Display common words per class

In [None]:
# Common words in toxic
text = [text for text in train.reduced_sent[train.toxic==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in severe_toxic
text = [text for text in train.reduced_sent[train.severe_toxic==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in obscene
text = [text for text in train.reduced_sent[train.obscene==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in threat
text = [text for text in train.reduced_sent[train.threat==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in insult
text = [text for text in train.reduced_sent[train.insult==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in identity_hate
text = [text for text in train.reduced_sent[train.identity_hate==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

In [None]:
# Common words in non_toxic
text = [text for text in train.reduced_sent[train.non_toxic==1]]
joined = ' '.join(text).split()
counter = Counter(joined)
print(counter.most_common(5))

### Bag of Words Feature Extraction

In [None]:
# Convert preprocessed sentences to lists
train_text = list(train['reduced_sent'])
test_text = list(test_combi['reduced_sent'])

# Unigram + bigram counts, limited to 10000 features
cv = CountVectorizer(max_features=10000, ngram_range=(1,2))

# Learn the vocabulary from the training text only
X_train = cv.fit_transform(train_text)
# Reuse that vocabulary for the test text. fit_transform here would rebuild the
# vocabulary from the test set, so a given column of X_test would no longer
# stand for the same n-gram as in X_train.
X_test = cv.transform(test_text)

### Models (Bag of Words features)

In [None]:
# Setting categories and labels
cat = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
y_train = train[cat]
y_test = test_combi[cat]

In [None]:
# Dimensions
print('X_train: ' + str(X_train.shape))
print('y_train: ' +str(y_train.shape))
print('X_test: ' +str(X_test.shape))
print('y_test: ' +str(y_test.shape))

#### Logistic Regression

In [None]:
lr = MultiOutputClassifier(estimator=LogisticRegression()).fit(X_train,y_train)

In [None]:
y_pred_lr1 = lr.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_lr1)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_lr1,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_lr1,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_lr1,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_lr1,average='macro', multi_class='ovr')))

#### Decision Tree

In [None]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=0).fit(X_train,y_train)

In [None]:
y_pred_dt1 = dt.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_dt1)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_dt1,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_dt1,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_dt1,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_dt1,average='macro', multi_class='ovr')))

#### Random Forest

In [None]:
# Random Forest
forest = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)

In [None]:
y_pred_rf1 = forest.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_rf1)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_rf1,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_rf1,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_rf1,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_rf1,average='macro', multi_class='ovr')))

#### KNN

In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train,y_train)

In [None]:
y_pred_knn1 = knn.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_knn1)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_knn1,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_knn1,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_knn1,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_knn1,average='macro', multi_class='ovr')))

### TF-IDF

In [None]:
# Convert preprocessed sentences to lists
train_text = list(train['reduced_sent'])
test_text = list(test_combi['reduced_sent'])

In [None]:
# Convert to TD-IDF Vector
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(train_text)
X_train = tfidf.transform(train_text)
X_test = tfidf.transform(test_text)

### Models (TF-IDF features)

#### Logistic Regression

In [None]:
lr = MultiOutputClassifier(estimator=LogisticRegression()).fit(X_train,y_train)

In [None]:
y_pred_lr2 = lr.predict(X_test)
acc_lr2 = accuracy_score(y_test,y_pred_lr2)
print('accuracy_score: ' + str(acc_lr2))
print('f1_score: ' +  str(f1_score(y_test,y_pred_lr2,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_lr2,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_lr2,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_lr2,average='macro', multi_class='ovr')))

#### Decision Tree

In [None]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=0).fit(X_train,y_train)

In [None]:
y_pred_dt2 = dt.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_dt2)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_dt2,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_dt2,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_dt2,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_dt2,average='macro', multi_class='ovr')))

#### Random Forest

In [None]:
# Random Forest
forest = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)

In [None]:
y_pred_rf2 = forest.predict(X_test)
acc_rf2 = accuracy_score(y_test,y_pred_rf2)
print('accuracy_score: ' + str(acc_rf2))
print('f1_score: ' +  str(f1_score(y_test,y_pred_rf2,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_rf2,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_rf2,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_rf2,average='macro', multi_class='ovr')))

#### KNN

In [None]:
# KNN
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train,y_train)

In [None]:
y_pred_knn2 = knn.predict(X_test)
print('accuracy_score: ' + str(accuracy_score(y_test,y_pred_knn2)))
print('f1_score: ' +  str(f1_score(y_test,y_pred_knn2,average='samples',zero_division=1)))
print('precision_score: ' +str(precision_score(y_test,y_pred_knn2,average='samples',zero_division=1)))
print('recall_score: ' +str(recall_score(y_test,y_pred_knn2,average='samples',zero_division=1)))
print('roc_auc_score: '+str(roc_auc_score(y_test,y_pred_knn2,average='macro', multi_class='ovr')))

### Submission

In [None]:
# Build the submission from the random forest predictions when its accuracy is
# at least that of logistic regression, otherwise from the logistic regression predictions
best_pred = y_pred_rf2 if acc_rf2 >= acc_lr2 else y_pred_lr2
submit_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
# 'id' first, then one column per label taken from the matching prediction column
submit = pd.DataFrame({'id':test_combi.id, **{name: best_pred[:,pos] for pos, name in enumerate(submit_cols)}})

In [None]:
# Reapplying labels with -1 for Kaggle submission - has no effect on scoring
# Select the test_labels rows in which at least one label equals -1
flag_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
has_minus_one = (test_labels[flag_cols] == -1).any(axis=1)
test_na = test_labels[has_minus_one]
test_na

In [None]:
# Append the -1 rows below the predictions (row-wise concat with a fresh index, not a join on 'id')
submit=pd.concat([submit,test_na],ignore_index=True)

In [None]:
# Write the submission to CSV without the index column
submit.to_csv('submission_file.csv',index=False)