In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

import pickle 
#import mglearn
import time
import warnings
import string


from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer # doesn't split at apostrophes
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.metrics import roc_auc_score , accuracy_score , confusion_matrix , f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# CountVectorizer

In [None]:
# Silence library warnings so the demo output below stays readable.
warnings.filterwarnings(action="ignore")

# Three toy sentences used to illustrate CountVectorizer.
txt = [
    "He is ::having a great Time, at the park time?",
    "She, unlike most women, is a big player on the park's grass.",
    "she can't be going",
]

In [None]:
# Unigram bag-of-words with English stop words removed and no frequency limits.
count_vec = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
)

# Learn the vocabulary, then encode the documents as a count matrix.
# count_train stays bound to the fitted vectorizer: a later cell reads
# count_train.vocabulary_.
count_train = count_vec.fit(txt)
bag_of_words = count_vec.transform(txt)
print("Encoded Document is:")
print(bag_of_words.toarray())

# Print every learned feature. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the replacement, and .tolist()
# keeps the printed value a plain list of str.
print("Every feature:\n{}".format(count_vec.get_feature_names_out().tolist()))

In [None]:
# vocabulary_ maps each learned term to its column index in the count matrix.
vocabulary = count_train.vocabulary_
print("Vocabulary size: {}".format(len(vocabulary)))
print("Vocabulary content:\n {}".format(vocabulary))

## CountVectorizer with a different n-gram range

In [None]:
# Same setup as before, but extract every n-gram from 1 up to 5 words long.
count_vec = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 5),
    max_df=1.0,
    min_df=1,
    max_features=None,
)

# Learn the n-gram vocabulary and encode the documents as a count matrix.
count_train = count_vec.fit(txt)
bag_of_words = count_vec.transform(txt)
print(bag_of_words.toarray())

# Print every learned feature. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the replacement.
print("Every feature:\n{}".format(count_vec.get_feature_names_out().tolist()))

## CountVectorizer restricted to terms within a minimum and maximum document frequency

In [None]:
# Keep only terms whose document frequency (as a proportion of documents)
# lies between min_df=0.3 and max_df=0.5.
count_vec = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 1),
    max_df=0.5,
    min_df=0.3,
    max_features=None,
)

# Here count_train is the encoded count matrix (fit_transform), not the vectorizer.
count_train = count_vec.fit_transform(txt)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(count_vec.get_feature_names_out().tolist())
print('Displays terms whose document frequency is between 0.3 and 0.5')

## Limiting the vocabulary to a user-specified maximum number of features

In [None]:
# Cap the vocabulary at max_features=4 terms.
count_vec = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=4,
)

count_train = count_vec.fit_transform(txt)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(count_vec.get_feature_names_out().tolist())

# TF-IDF Vectorizer

In [None]:
# Small corpus used for the TF-IDF walkthrough.
txt1 = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Good this is nice',
    'Is this the first document?',
]

In [None]:
# Plain TF-IDF: no IDF smoothing, no sublinear TF scaling, no row normalisation.
tf = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    sublinear_tf=False,
)

# Learn vocabulary and IDF weights, then encode the same corpus.
txt_fitted = tf.fit(txt1)
txt_transformed = txt_fitted.transform(txt1)
print ("The text: ", txt1)

In [None]:
# Display the vocabulary learned by the TF-IDF vectorizer fitted on txt1.
tf.vocabulary_

In [None]:
# Per-term inverse document frequency learned during fit.
idf = tf.idf_

# Map each term to its IDF weight. These are IDF values, not TF-IDF scores.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(dict(zip(txt_fitted.get_feature_names_out().tolist(), idf)))

In [None]:
# term -> IDF weight lookup, used to build the plot below.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
rr = dict(zip(txt_fitted.get_feature_names_out().tolist(), idf))

In [None]:
# Tabulate each token next to its IDF weight (one row per token).
token_weight = pd.DataFrame(list(rr.items()), columns=['token', 'weight'])

# Bar chart of the IDF weight of every token.
sns.barplot(data=token_weight, x='token', y='weight')
plt.title("Inverse Document Frequency(idf) per token")
fig = plt.gcf()
fig.set_size_inches(14, 5)
plt.show()

## Finding features with lowest and highest IDF

In [None]:
# Feature names as an array so they can be fancy-indexed by sorted positions.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
feature_names = np.asarray(tf.get_feature_names_out())

# Feature indices ordered from smallest to largest IDF.
sorted_by_idf = np.argsort(tf.idf_)

print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:3]]))
print("\nFeatures with highest idf:\n{}".format(
       feature_names[sorted_by_idf[-3:]]))

In [None]:
# Display the TF-IDF encoding of txt1 as a dense array.
txt_transformed.toarray()

## Finding features with lowest and highest TF-IDF

In [None]:
# Re-encode the corpus with the fitted TF-IDF vectorizer.
new1 = tf.transform(txt1)

# Largest TF-IDF value each feature reaches in any document.
max_val = new1.max(axis=0).toarray().ravel()

# Feature indices ordered from smallest to largest maximum weight.
sort_by_tfidf = np.argsort(max_val)

lowest_tfidf = feature_names[sort_by_tfidf[:3]]
highest_tfidf = feature_names[sort_by_tfidf[-3:]]

print("Features with lowest tfidf:\n{}".format(lowest_tfidf))

print("\nFeatures with highest tfidf: \n{}".format(highest_tfidf))

# Comment Classification

In [None]:
# Load the zipped CSVs: labelled training comments, unlabelled test comments,
# and the test labels file.
train_data = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
)
test_data = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
)
test_target = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
)


In [None]:
# Preview the first rows of the test labels file.
test_target.head()

In [None]:
# Count missing values per column in the training data.
train_data.isnull().sum()

In [None]:
# Summary statistics for the training data.
train_data.describe()

In [None]:
# Keep only the label columns (everything except the id and the raw text).
comments = train_data.drop(columns=['id', 'comment_text'])

# The mean of each label column, times 100, is the percentage of comments
# carrying that label.
for label in comments.columns:
    percent = round(100 * comments[label].mean(), 2)
    print("Percent of {0}s: ".format(label), percent, "%")

In [None]:
# Label names and, in the same order, how many comments carry each label.
# Built directly from the columns instead of first binding `classes` to a dict
# and then rebinding it to a list.
classes = list(comments.columns)
n_classes = comments.sum().tolist()  # plain Python ints, one per label

print(n_classes)  # number of comments flagged with each label
# Total number of label assignments. This is NOT the number of flagged rows:
# a comment that carries several labels is counted once per label.
print(sum(n_classes))

In [None]:
# Pass figsize to subplots() directly. A separate plt.figure(figsize=...) call
# creates an extra, empty figure and leaves the plotted one at the default size.
fig, ax = plt.subplots(figsize=(15, 12))
ax.bar(classes, n_classes, color='cyan')
ax.set_xlabel("Label")
ax.set_ylabel("Number of comments")
ax.set_title("Comments per label")
plt.show()

In [None]:
# Ordered (pattern, replacement) rewrites that expand contractions. Order
# matters: the specific "won't"/"can't" rules must run before the generic "n't".
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"\r", ""),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),      # was wrongly expanded to "that is"
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Restore a dropped final "g" ("goin'" -> "going"). Limited to a word-final
    # "in'" so possessives such as "john's" are not turned into "johngs".
    (r"in'(?!\w)", "ing"),
    (r"'bout", "about"),
    (r"'til", "until"),
)

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw comment for vectorisation.

    Steps, in order: lowercase the text, expand common English contractions,
    delete all ASCII punctuation, replace every remaining non-word character
    with a space, and drop every whitespace-delimited token that contains a
    digit (together with the whitespace following it).

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned, lowercased comment. Runs of spaces are not collapsed and
        the result is not stripped.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # string.punctuation already contains every character the previous
    # hand-written regex class removed, so one translate() pass is enough.
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\W", " ", text)
    # Raw string: "\S" / "\d" are invalid escapes in a normal string literal.
    text = re.sub(r"\S*\d\S*\s*", "", text)

    return text

In [None]:
#Applying the above function in the dataset
# Overwrite the raw comments with their cleaned form (see clean_text).
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)

In [None]:
# Fetch the NLTK stopword corpus; stemmer() below reads stopwords.words('english').
nltk.download('stopwords')

In [None]:
sn = SnowballStemmer(language='english')

# Build the stop-word set once. The previous version rebuilt
# set(stopwords.words('english')) for every single token of every comment.
# Requires the 'stopwords' corpus to be downloaded before this cell runs.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemmer(text):
    """Remove English stop words from ``text`` and stem the remaining words.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    return ' '.join(
        sn.stem(word) for word in text.split() if word not in _STOP_WORDS
    )

In [None]:
# Overwrite the cleaned comments with their stop-word-free, stemmed form.
train_data['comment_text'] = train_data['comment_text'].apply(stemmer)

In [None]:
# Features: the preprocessed comment text. Targets: the label columns.
x = train_data['comment_text']
y = train_data.drop(columns=['id', 'comment_text'])

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=45
)

In [None]:
# Word-level TF-IDF over 1- to 3-grams with sublinear TF scaling.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',     # any run of one or more word characters
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True)

# fit_transform learns the vocabulary and encodes the training text in one
# step, instead of calling fit() and then transform() on the same data.
train_word_features = word_vectorizer.fit_transform(x_train)

In [None]:
# Encode both splits with the vectorizer fitted on the training text only.
X_train_transformed, X_test_transformed = (
    word_vectorizer.transform(split) for split in (x_train, x_test)
)

In [None]:
# Print the TF-IDF feature matrix of the training split.
print(X_train_transformed)

In [None]:
# Base estimator: L2-regularised logistic regression.
log_reg = LogisticRegression(
    C=10,
    penalty='l2',
    solver='liblinear',
    random_state=45,
)

# One independent binary classifier per label column.
classifier = OneVsRestClassifier(log_reg).fit(X_train_transformed, y_train)

# Per-label probabilities for the training and the held-out split.
y_train_pred_proba = classifier.predict_proba(X_train_transformed)
y_test_pred_proba = classifier.predict_proba(X_test_transformed)

# ROC AUC across labels, using average='weighted'.
roc_auc_score_train = roc_auc_score(
    y_train, y_train_pred_proba, average='weighted'
)
roc_auc_score_test = roc_auc_score(
    y_test, y_test_pred_proba, average='weighted'
)

print("ROC AUC Score Train:", roc_auc_score_train)
print("ROC AUC Score Test:", roc_auc_score_test)


In [None]:
def make_test_predictions(df, classifier, submission_path='19BCE226_Submission.csv'):
    """Predict label probabilities for ``df`` and write a submission CSV.

    The comments are cleaned with ``clean_text``, stemmed with ``stemmer`` and
    encoded with the already-fitted module-level ``word_vectorizer`` before
    being scored.

    Previously the function returned before building the submission, so the
    CSV was never written; it also overwrote ``df.comment_text`` in place, so
    calling it twice cleaned the text twice. Both are fixed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``id`` and ``comment_text`` columns. It is not modified.
    classifier : fitted estimator exposing ``predict_proba``
        Must have been trained on features produced by ``word_vectorizer``.
    submission_path : str or None, default '19BCE226_Submission.csv'
        Where to write the submission file. Pass ``None`` to skip writing.

    Returns
    -------
    array-like
        The result of ``classifier.predict_proba`` on the encoded comments.
    """
    # Work on a derived Series so the caller's frame stays untouched.
    prepared_text = df.comment_text.apply(clean_text).apply(stemmer)
    X_test_transformed = word_vectorizer.transform(prepared_text)
    y_test_pred = classifier.predict_proba(X_test_transformed)

    if submission_path is not None:
        # Reuse df's index so the concat lines up row-for-row with df.id even
        # when df does not carry a default RangeIndex.
        y_test_pred_df = pd.DataFrame(
            y_test_pred, columns=comments.columns, index=df.index
        )
        submission_df = pd.concat([df.id, y_test_pred_df], axis=1)
        submission_df.to_csv(submission_path, index=False)

    return y_test_pred

In [None]:
# Show the type of test_data before it is passed to make_test_predictions.
print(type(test_data))

In [None]:
# Run the prediction pipeline on the test comments; as the last expression in
# the cell, the returned probabilities are displayed.
make_test_predictions(test_data,classifier)