In [None]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Display the English stop-word list from NLTK as a set (duplicates removed).
set(stopwords.words('english'))

In [None]:
def load_data(file):
    """Read a single CSV file from `file` and return it as a DataFrame."""
    frame = pd.read_csv(file)
    return frame


# The order of this list determines how the loaded frames are unpacked below.
files=[
    '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv',
    '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv',
    '../input/jigsaw-unintended-bias-in-toxicity-classification/all_data.csv',
    '../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv',
]

# Read the CSVs in worker processes; pool.map returns results in input order.
with multiprocessing.Pool() as pool:
    loaded_frames = pool.map(load_data, files)

test, train, all_data, sub = loaded_frames

In [None]:
# for col in all_data.columns:
#     print("{} -----------> {}".format(col,all_data[col].dtypes))
#     print("{} ===========> {}".format(col,train[col].dtypes))
# Print the summary produced by DataFrame.info() for the training frame.
train.info()

In [None]:
# Show the first five entries of the target value counts (missing values excluded).
train.target.value_counts(dropna=True).head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Number of rows whose target is missing.
train['target'].isnull().sum()

In [None]:
# Keep only the comment text and its target score; later cells work on X.
X=train[['comment_text','target']]
# Display every column name of the full training frame for reference.
train.columns.values

In [None]:
# Drop the name of the full training frame; the following cells use X instead.
del train

In [None]:
# Count toxic vs. neutral comments with one vectorized comparison instead of a
# Python-level loop with per-row label lookups.
# A comment is toxic when its target is strictly above 0.7; everything else,
# including rows with a missing target, is counted as neutral (same rule as
# the original if/else).
no_of_rows=X.shape[0]
tox=int((X['target']>0.7).sum())
neut=no_of_rows-tox

In [None]:
# Report each class as a percentage of all rows, rounded to three decimals.
print(f'{round((tox*100)/no_of_rows,3)}% data contains toxic comments')
print(f'{round((neut*100/no_of_rows),3)}% data contains neutral comments')

# **Preprocessing comment_text for training**

In [None]:
# Patterns are compiled once and written as raw strings so that backslash
# classes such as \w and \d are not treated as (invalid) string escapes.
_ALPHANUMERIC_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NEWLINE_RE = re.compile(r'\n')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with one space."""
    return _ALPHANUMERIC_RE.sub(' ', x)


def punc_lower(x):
    """Lowercase `x` and replace each ASCII punctuation character with a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


def remove_n(x):
    """Replace each newline character in `x` with a space."""
    return _NEWLINE_RE.sub(' ', x)


def remove_non_ascii(x):
    """Replace each character outside the ASCII range (0x00-0x7f) with a space."""
    return _NON_ASCII_RE.sub(' ', x)

# Apply the cleaning functions defined above, in order, to the comments column via .map
X['comment_text'] = X['comment_text'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)

In [None]:
import wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

def wordcloud(df, label):
    """Draw a word cloud of the comments whose `label` value is above 0.7.

    df    -- frame with a `comment_text` column and a numeric `label` column
    label -- name of the score column; also used in the plot title
    """
    selected_rows = df[df[label] > 0.7]
    corpus = " ".join(selected_rows.comment_text.values)

    cloud = WordCloud(background_color="white", max_words=4000)
    cloud.generate(corpus)

    plt.figure(figsize=(20, 20))
    plt.subplot(221)
    plt.axis("off")
    plt.title("Words frequented in {}".format(label), fontsize=20)
    recolored = cloud.recolor(colormap='gist_earth', random_state=244)
    plt.imshow(recolored, alpha=0.98)

In [None]:
# Plot the word cloud for comments whose target is above 0.7.
wordcloud(X,'target')

# Handling Class Imbalance

In [None]:
# First 45451 rows (in file order) whose target exceeds the 0.7 threshold.
is_toxic = X['target'] > 0.7
toxic_train = X[is_toxic].head(45451)
toxic_train.shape

In [None]:
# First 150000 rows (in file order) whose target is at or below the 0.7 threshold.
is_neutral = X['target'] <= 0.7
neutral_train = X[is_neutral].head(150000)
neutral_train.shape

In [None]:
# Stack the toxic and neutral subsets row-wise into a single training frame.
subsets = [toxic_train, neutral_train]
balanced_train = pd.concat(subsets)
balanced_train.shape

# balanced_train=X

In [None]:
# The two subsets now live in balanced_train; drop their separate names.
del toxic_train, neutral_train

In [None]:
# Import packages for pre-processing
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel

# Import tools to split data and evaluate model performance
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, fbeta_score, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve

# Import ML algos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model F1 Score Comparison 

In [None]:
def cv_tf_train_test(df_done,label,vectorizer,ngram):
    """Fit five classifiers on vectorized comments and return their test-set F1 scores.

    Parameters
    ----------
    df_done : DataFrame
        Must provide a ``comment_text`` column and the column named by ``label``.
    label : str
        Name of the target column. The default ``f1_score`` settings used here
        are for binary labels.
    vectorizer : class
        Vectorizer class to instantiate: ``CountVectorizer`` or ``TfidfVectorizer``.
    ngram : tuple
        ``ngram_range`` passed to the vectorizer: (1,1) for unigram, (2,2) for bigram.

    Returns
    -------
    DataFrame
        One ``'F1 Score'`` column indexed by model name, in the order
        Log Regression, KNN, XGB, SVM, Random Forest.
    """
    # Split the data into features (raw text) and labels
    comments = df_done.comment_text
    labels = df_done[label]

    # Hold out 30% of the rows for scoring; the seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(comments, labels, test_size=0.3, random_state=42)

    # Create a vectorizer that drops English stop words
    cv1 = vectorizer(ngram_range=ngram, stop_words='english')

    X_train_cv1 = cv1.fit_transform(X_train)  # learn the vocabulary on the training text only
    X_test_cv1 = cv1.transform(X_test)        # reuse that vocabulary; nothing is learned here

    # (display name, estimator, progress message printed after fitting or None)
    candidates = [
        ('Log Regression', LogisticRegression(), 'lr done'),
        ('KNN', KNeighborsClassifier(n_neighbors=5), None),
        ('XGB', XGBClassifier(), None),
        ('SVM', LinearSVC(), None),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42), 'rdf done'),
    ]

    model_names = []
    scores = []
    for model_name, model, done_message in candidates:
        model.fit(X_train_cv1, y_train)
        if done_message is not None:
            print(done_message)
        # f1_score expects (y_true, y_pred) in that order
        scores.append(f1_score(y_test, model.predict(X_test_cv1)))
        model_names.append(model_name)

    # One row per model, labelled with the model name
    df_f1 = pd.DataFrame({'F1 Score': scores}, index=model_names)
    return df_f1


# Assigning Binary Value to Labels

In [None]:
# Targets above 0.7 become 1.0, all others 0.0.
above_threshold = balanced_train['target'] > 0.7
balanced_train['target'] = above_threshold.astype(float)
balanced_train.head()

In [None]:
import time

# Time the full model comparison, including the column rename.
t0 = time.time()

df_tox_cv = cv_tf_train_test(balanced_train, 'target', TfidfVectorizer, (1,1)).rename(
    columns={'F1 Score': 'F1 Score(target)'}
)

t1 = time.time()

total = 'Time taken: {} seconds'.format(t1-t0)
print(total)

# Last expression of the cell: display the F1 table.
df_tox_cv

In [None]:
# Feature text and label column for the final model (X is rebound here to a Series).
X = balanced_train.comment_text
y = balanced_train['target']

# Hold out 30% of the rows; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initiate a Tfidf vectorizer
tfv = TfidfVectorizer(ngram_range=(1,1), stop_words='english')

X_train_fit = tfv.fit_transform(X_train)  # Learn the vocabulary on the training comments and vectorize them
X_test_fit = tfv.transform(X_test)  # Vectorize the held-out comments with the already-fitted vocabulary


# SVM Classifier

In [None]:
# from sklearn.calibration import CalibratedClassifierCV
# svm_model = LinearSVC()
# clf = CalibratedClassifierCV(svm_model) 
# clf.fit(X_train_fit, y_train)
    

# my_ans=[]
# for row in range(test.shape[0]):
#     comment=[test['comment_text'][row]]
#     cmt=tfv.transform(comment)
#     my_ans.append(clf.predict_proba(cmt)[:,1])

# data={'id':[],
#       'prediction':[]
#      }
# df=pd.DataFrame(data)

# df['id']=test['id']

# df['prediction']=pd.DataFrame(my_ans)

# Logistic Regression Classifier

In [None]:
# lr=LogisticRegression()
# lr.fit(X_train_fit,y_train)

# my_ans=[]
# for row in range(test.shape[0]):
#     comment=[test['comment_text'][row]]
#     cmt=tfv.transform(comment)
#     my_ans.append(lr.predict_proba(cmt)[:,1])

# data={'id':[],
#       'prediction':[]
#      }
# df=pd.DataFrame(data)

# df['id']=test['id']

# df['prediction']=pd.DataFrame(my_ans)

# Random Forest Classifier

In [None]:
# Fit the final random forest on the TF-IDF matrix of the training split.
randomforest = RandomForestClassifier(n_estimators=100,random_state=42)
randomforest.fit(X_train_fit,y_train)

# Clean the test comments with the same steps that were applied to the training
# text, so the fitted vectorizer sees input in the form it was trained on.
test_comments = (
    test['comment_text']
    .map(alphanumeric)
    .map(punc_lower)
    .map(remove_n)
    .map(remove_non_ascii)
)

# Vectorize and score all test comments in one batch instead of one row at a
# time; column 1 of predict_proba is the probability of the positive class.
test_features = tfv.transform(test_comments)
my_ans = randomforest.predict_proba(test_features)[:, 1]

# Submission frame: one prediction per test id, in test order.
df = pd.DataFrame({'id': test['id'], 'prediction': my_ans})

In [None]:
# Write the submission frame to disk without the row index column.
df.to_csv('submission.csv',index=False)