In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input mount.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import math
import gc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
# from imblearn.over_sampling import SMOTE # for over-sampling
import joblib # for saving models
import warnings
warnings.filterwarnings('ignore')

### Importing cleaned dataframes

In [None]:
# Load the train and test frames (presumably the output of an earlier cleaning step,
# judging by the dataset name -- confirm against the upstream notebook).
# NOTE(review): joblib.load unpickles its input, and unpickling can execute arbitrary
# code; only point these paths at artifacts from a trusted source.
df=joblib.load('/kaggle/input/toxic-comment-classification-cleaned/df.pkl')
df_test=joblib.load('/kaggle/input/toxic-comment-classification-cleaned/df_test.pkl')

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df_test.head()

In [None]:
# Missing-value counts per column for the test frame (this cell follows df_test.head();
# the training frame has its own null check earlier in the notebook).
df_test.isnull().sum()

### Reducing dataframe size

In [None]:
#creating reduce mem function
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; every other column is left untouched.

    * Integer columns are converted to the first of int8/int16/int32/int64 whose
      range contains the column's min and max. Bounds are inclusive, so a value
      sitting exactly on a dtype's limit (e.g. 127 for int8) still fits.
    * Float columns are converted to float16, else float32, else float64, using
      the same range test. The test looks at magnitude only, so values may be
      rounded when stored in the narrower float type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN min/max: use full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the training frame's numeric columns; reduce_mem_usage modifies the frame
# it is given and returns that same object, so this rebinding is a no-op alias.
df=reduce_mem_usage(df)
# Run a garbage-collection pass now that the wider column buffers have been replaced.
gc.collect()

In [None]:
# Same downcasting for the test frame (modified in place and returned).
df_test=reduce_mem_usage(df_test)
# Run a garbage-collection pass now that the wider column buffers have been replaced.
gc.collect()

### EDA

##### Distribution of Classes

In [None]:
# One bar chart per label, showing how many training comments carry each label value.
fig,axes=plt.subplots(3,2,figsize=(15,15))

for ax,class_name in zip(axes.flatten(),['toxic','severe_toxic','obscene','threat','insult','identity_hate']):
    # Order the counts by label value rather than by frequency, so the first bar is
    # always class 0 and the second class 1 (assumes binary 0/1 labels -- TODO confirm).
    # reindex also keeps a zero-height bar if one of the classes is absent.
    counts=df[class_name].value_counts().reindex([0,1],fill_value=0)
    # pandas labels the bars from the Series index, so no manual tick relabelling is
    # needed. The previous set_xticks(range(2),[0,1]) relabelled bars by position
    # (wrong whenever class 1 is the majority) and, on matplotlib < 3.5, passed the
    # list as the `minor` flag instead of as labels.
    counts.plot(kind='bar',rot=0,ax=ax)
    ax.set_title('{} Distribution'.format(class_name))
    ax.set_xlabel('Labels')
    ax.set_ylabel('Frequency')

plt.show()

### Feature Engineering

In [None]:
# Stack the train and test comment text into one Series with a fresh 0..n-1 index.
all_text=pd.concat(
    [df['lemmatized'], df_test['lemmatized']],
    ignore_index=True,
)
all_text.head()

#### Creating TF/IDF vectors

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# # initialize TFIDF
# word_vectorizer = TfidfVectorizer(ngram_range=(1,1),max_features=10000,analyzer='word',dtype=np.float32)
# char_vectorizer=TfidfVectorizer(analyzer='char',ngram_range=(2, 6),max_features=30000)

In [None]:
# word_vectorizer.fit(all_text)
# char_vectorizer.fit(all_text)

In [None]:
# # create TFIDF for train
# train_word_features = word_vectorizer.transform(df['lemmatized'])
# train_char_features=char_vectorizer.transform(df['lemmatized'])
# gc.collect()

In [None]:
# train_char_features

In [None]:
# train_word_features

In [None]:
# # create TFIDF for test
# test_word_features = word_vectorizer.transform(df_test['lemmatized'])
# test_char_features=char_vectorizer.transform(df_test['lemmatized'])
# gc.collect()

In [None]:
# test_char_features

In [None]:
# test_word_features

In [None]:
# from scipy.sparse import hstack

In [None]:
# train_features=hstack([train_word_features,train_char_features])
# test_features=hstack([test_word_features,test_char_features])

In [None]:
# train_features.shape

In [None]:
# test_features.shape

#### Training Word2Vec

In [None]:
# # Creating data for the model training
# comments=[]
# for i in all_text:
#     comments.append(i.split())
# comments[:5]

In [None]:
# from gensim.models import Word2Vec

In [None]:
# # training a word2vec model from the given data set
# w2v_model = Word2Vec(comments, size=300, min_count=2,window=4, sg=1,workers=4)

In [None]:
# # vocabulary size
# print('vocabulary size:', len(w2v_model.wv.vocab))

In [None]:
# # returns the mean vector representation of a tokenized document (random vectors stand in for out-of-vocabulary tokens)
# def get_embedding_w2v(doc_tokens):
#     embeddings = []
#     if len(doc_tokens)<1:
#         return np.zeros(300)
#     else:
#         for tok in doc_tokens:
#             if tok in w2v_model.wv.vocab:
#                 embeddings.append(w2v_model.wv.word_vec(tok))
#             else:
#                 embeddings.append(np.random.rand(300))
#         # mean the vectors of individual words to get the vector of the statement
#         return np.mean(embeddings, axis=0)

In [None]:
# X=df['lemmatized'].apply(lambda x :get_embedding_w2v(x.split()))
# X=pd.DataFrame(X.tolist())
# print('Shape of X=>',X.shape)

In [None]:
# X_test=df_test['lemmatized'].apply(lambda x:get_embedding_w2v(x.split()))
# X_test=pd.DataFrame(X_test.tolist())
# print('Shape of X_test=>',X_test.shape)

#### Using Pre-trained Word2Vec

In [None]:
from gensim.models import KeyedVectors

# Location of the pre-trained GoogleNews word2vec binary inside the Kaggle input mount.
# NOTE(review): this rebinds `filename`, which the file-listing loop in the first cell
# also used as its loop variable.
filename = '/kaggle/input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin'
# Read the word2vec file into a gensim KeyedVectors object; binary=True selects the
# binary (rather than text) word2vec file format.
w2vec = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
# returns the averaged word-vector representation of a tokenized document
def get_pre_trained_embedding_w2v(doc_tokens):
    """Return the mean pre-trained word2vec vector of a tokenized document.

    Each token found in the module-level ``w2vec`` KeyedVectors contributes its
    vector; the document embedding is the element-wise mean of those vectors.

    Out-of-vocabulary tokens are skipped. They were previously replaced by a
    fresh, unseeded ``np.random.rand`` vector per occurrence, which made the
    features differ from run to run and pulled every dimension towards 0.5.

    Parameters
    ----------
    doc_tokens : iterable of str
        Tokens of one document (e.g. ``text.split()``).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2vec.vector_size``. All zeros when the document
        is empty or none of its tokens are in the vocabulary.
    """
    dim = w2vec.vector_size
    # Membership and item lookup on the KeyedVectors object itself work in both
    # gensim 3.x and 4.x, unlike `.wv.vocab` / `.wv.word_vec`.
    known_vectors = [w2vec[tok] for tok in doc_tokens if tok in w2vec]
    if not known_vectors:
        return np.zeros(dim)
    # mean the vectors of individual words to get the vector of the statement
    return np.mean(known_vectors, axis=0)

In [None]:
# Embed every training comment from its whitespace-separated tokens and stack the
# resulting vectors into a frame: one row per comment, one column per dimension.
X=pd.DataFrame(
    [get_pre_trained_embedding_w2v(text.split()) for text in df['lemmatized']]
)
print('Shape of X=>',X.shape)

In [None]:
# Embed every test comment the same way: one row per comment, one column per dimension.
X_test=pd.DataFrame(
    [get_pre_trained_embedding_w2v(text.split()) for text in df_test['lemmatized']]
)
print('Shape of X_test=>',X_test.shape)

### Modeling

In [None]:
# Label matrix as a NumPy array: one row per training comment, one column per label,
# in the same label order that the training loop enumerates.
target=df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

In [None]:
# Dataframe for the final probabilities: shares df_test's index, carries df_test's
# `id` column, and has one initially empty column per label that the training loop
# fills with the predicted positive-class probability.
prob=pd.DataFrame(columns=['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'],index=df_test.index)
prob['id']=df_test['id']

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
#Training Models
# For each label: (1) fit a model on an 80/20 stratified split to estimate ROC-AUC,
# then (2) refit on every training row and store the test-set probabilities.
# NOTE(review): LogisticRegression runs with its default iteration limit, and the
# global warnings filter at the top of the notebook would hide any convergence warning.
for index,value in enumerate(['toxic','severe_toxic','obscene','threat','insult','identity_hate']):
    print('{} - Model:\n'.format(value))

    y=target[:,index]
    print('Y=>',y)

    #Preparing Test Model
    # splitting train and test set
    x_train, x_test, y_train, y_test = train_test_split(X, y,stratify=y, test_size=0.2, random_state=42)
    #Test Model
    test_model=LogisticRegression(random_state=42)
    test_model=test_model.fit(x_train,y_train)
    # ROC-AUC measures how well the scores rank positives above negatives, so it
    # must be given the positive-class probability. Hard 0/1 predictions reduce
    # the ROC curve to a single operating point and misreport the score.
    #In-sample Evaluation
    train_pred=test_model.predict_proba(x_train)[:, 1]
    print('In-sample Evaluation ROC-AUC Score:\n',roc_auc_score(y_train,train_pred))
    #Out-sample Evaluation
    test_pred=test_model.predict_proba(x_test)[:, 1]
    print('Out-sample Evaluation ROC-AUC Score\n',roc_auc_score(y_test,test_pred))

    #Preparing Final Model on whole dataset
    model=LogisticRegression(random_state=42)
    model = model.fit(X, y)
    y_pred=model.predict_proba(X)[:, 1]
    print('In-sample Evaluation on Whole Dataset ROC-AUC Score:\n',roc_auc_score(y,y_pred))
    print('Model=>',model)
    # Column 1 of predict_proba is the probability of the positive class.
    prob[value]=model.predict_proba(X_test)[:, 1]

In [None]:
prob

In [None]:
prob.to_csv('submission-LR-w2v-pre-trained-all.csv',index=False)