In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import zipfile

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## 1. Files Unzipping

In [None]:
def _extract_and_read(zip_path, csv_path):
    """Unpack every member of ``zip_path`` into the working directory, then load ``csv_path``."""
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall()
    return pd.read_csv(csv_path)


# Training data: carries the toxicity label columns used further down.
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
df = _extract_and_read(train_path, './train.csv')

# Competition test data: only used to produce the submission file.
test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
real_test_df = _extract_and_read(test_path, './test.csv')

## 2. Minor Analysis

In [None]:
## Splitting dataset
# The last 1000 rows become a local hold-out set; everything before them is
# the training set. `.copy()` makes both frames independent of the frame they
# were sliced from, so the column assignments in later cells write to real
# DataFrames instead of slices (no SettingWithCopyWarning, no ambiguous writes).
# NOTE: re-running this cell removes another 1000 rows from `df`.
split_test_df = df.iloc[-1000:, :].copy()
df = df.iloc[:-1000, :].copy()

# A comment counts as "not labelled" when all six toxicity flags are 0.
toxicity_flags = ['toxic', 'severe_toxic', 'obscene', 'threat',
                  'insult', 'identity_hate']
unlabelled_share = (df[toxicity_flags] == 0).all(axis=1).mean()

# The header announces a percentage, so scale the 0-1 share accordingly.
print('Percentage of comments that are not labelled:')
print('{:.2f}%'.format(100 * unlabelled_share))

In [None]:
## mean, standard deviation and max length of texts (in characters)
lens = df.comment_text.str.len()
# Printed explicitly: only the last expression of a cell is displayed.
print(lens.mean(), lens.std(), lens.max())

## flag empty texts
# The check has to be per row: `len(df['comment_text'])` is the number of rows
# (a single scalar), which made the flag 0 for every row regardless of content.
df['is_empty'] = np.where(lens < 1, 1, 0)
df[df['is_empty']==1]

In [None]:
pip install contractions

## 3. Text Refinement

In [None]:
## expanding contractions word by word
def text_refining(text):
    """Expand contractions in ``text``, one whitespace-separated word at a time.

    Each word is passed through ``contractions.fix``; the processed words are
    joined back with single spaces, so runs of whitespace collapse and
    leading/trailing whitespace is dropped.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The text with every word replaced by its expanded form. A word that
        ``contractions.fix`` fails on is kept unchanged.

    Raises
    ------
    ImportError
        If the ``contractions`` package is not installed.
    """
    # Imported here (after the install cell) and outside the try block: the
    # module was never imported before, so every call raised NameError, which
    # the old bare `except` swallowed -- the function silently did nothing.
    import contractions

    new_text = []
    for word in text.split():
        try:
            new_word = contractions.fix(word)
        except Exception:
            # Best effort, kept from the original: presumably guards against
            # words the library cannot process (the old comment mentioned
            # "two turkish words") -- TODO confirm which exception that is.
            new_word = word
        new_text.append(new_word)

    return " ".join(new_text)

# Run the same refinement over the training frame and the local hold-out frame.
for frame in (df, split_test_df):
    frame['comment_text'] = frame['comment_text'].apply(text_refining)


## 4. Tfidf Vectorizer 1-gram

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Unigram TF-IDF limited to 10000 features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=10000)

# The vocabulary is fitted on the training comments only; the hold-out and
# competition test comments are merely transformed with it.
train_comments = df['comment_text'].values
X = tfidf_vectorizer.fit_transform(train_comments)

holdout_comments = split_test_df['comment_text'].values
X_test = tfidf_vectorizer.transform(holdout_comments)

# The competition test set gets the same text refinement before vectorizing.
real_test_df['comment_text'] = real_test_df['comment_text'].apply(text_refining)
real_X_test = tfidf_vectorizer.transform(real_test_df['comment_text'].values)

## 5. Wordcloud

In [None]:
import math
from wordcloud import WordCloud

label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                 'insult', 'identity_hate']
count_label = {}

# Three word clouds per row, as many rows as needed for all labels.
n_cols = 3
n_rows = math.ceil(len(label_columns) / n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 15))

for position, label_name in enumerate(label_columns):
    # Comments flagged with this label.
    flagged = df[df[label_name] > 0]
    count_label[label_name] = len(flagged)

    # One cloud built from all flagged comments joined into a single string.
    cloud = WordCloud(background_color='white', width=2500, height=2000)
    cloud = cloud.generate(' '.join(flagged.comment_text.values))

    row, column = divmod(position, n_cols)
    panel = axes[row, column]
    panel.imshow(cloud)
    panel.axis('off')
    panel.set_title(label_name)

    


In [None]:
# Imbalanced data: number of training comments flagged with each label.
# The frame itself is the cell output; the previous trailing `.count` only
# referenced the DataFrame.count method and displayed its bound-method repr.
pd.DataFrame.from_dict(count_label, orient='index', columns=['count'])

## 6. Logistic model per label

In [None]:
import numpy as np

# Single-step pipeline; the same object is re-fitted for every label.
clf = Pipeline([('ovr', LogisticRegression(solver='sag', n_jobs=-1))])

# Start with the id column, then append one prediction column per label.
result = real_test_df.id.values.reshape(-1, 1)
for label in label_columns:
    print("-----------------{0}---------------".format(label))
    clf.fit(X, df[label].values)
    pred = clf.predict(real_X_test).reshape(-1, 1)
    result = np.hstack((result, pred))
    print(pred.shape, type(pred))
    # Hold-out accuracy check left disabled, as before. As it was written
    # (accuracy_score(split_test_df[label].values, pred)) it would compare
    # hold-out labels with predictions for the competition test set; it would
    # need clf.predict(X_test) instead.

# NOTE(review): predict() yields hard class labels; if the competition scores
# probabilities, predict_proba would be required -- confirm.
# NOTE(review): the One-vs-Rest section below writes the same file path, so
# this file is overwritten when the whole notebook runs.
submission_columns = ['id'] + label_columns
submission = pd.DataFrame(result, columns=submission_columns)
submission.to_csv('./submission.csv', index=False)

## 7. One Vs Rest

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Same logistic-regression settings as the per-label section, wrapped in
# OneVsRestClassifier so all label columns are fitted in one call.
base_estimator = LogisticRegression(solver='sag', n_jobs=-1)
ovr_classifier = OneVsRestClassifier(base_estimator)
clf = Pipeline([('ovr', ovr_classifier)])

# Targets: one column per entry of label_columns.
clf.fit(X, df[label_columns].values)

In [None]:
# Predict on the TF-IDF features of the competition test set with the
# One-vs-Rest pipeline; the next cell stacks the result next to the id column.
clf_pred = clf.predict(real_X_test)

In [None]:
# Put the id column in front of the One-vs-Rest predictions and write the
# submission file (same path as the per-label section, which it replaces).
id_column = real_test_df.id.values.reshape(-1, 1)
submission_values = np.hstack((id_column, clf_pred))
submission_columns = ['id'] + label_columns

ovr_submission = pd.DataFrame(submission_values, columns=submission_columns)
ovr_submission.to_csv('./submission.csv', index=False)