# Malignant Comments Project

In [None]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session ('ignore' is applied with no category filter).
warnings.filterwarnings('ignore')

## Data Reading and Understanding

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="../input/malignant-comments/Malignant_train.csv")
df_train.head(n=5)

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer="../input/malignant-comments/Malignant_test.csv")
df_test.head(n=5)

In [None]:
# Print (rows, columns) for the train frame first, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

The training dataset has 159571 rows and 8 columns, while the test dataset has 153164 rows and 2 columns.

In [None]:
# Column names, dtypes and non-null counts for both frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.

# check information of train data
df_train.info()

# check information of test data
df_test.info()

Neither the training dataset nor the test dataset contains any null values.

In [None]:
# Visualise missing values in the train frame: build the boolean null mask, then draw it as a heatmap.
train_null_mask = df_train.isnull()
sns.heatmap(data=train_null_mask)

The heatmap above confirms that there are no null values in the training dataset.

In [None]:
# Descriptive statistics of the train dataset, covering every column (not just the numeric ones).
train_summary = df_train.describe(include='all')
train_summary

1. The table above shows that there are no duplicate values in the `comment_text` column.
2. Every numerical column takes only two values: 0 and 1.

In [None]:
# Correlation heatmap of the numeric features.
# The frame also holds text columns (e.g. comment_text). Recent pandas versions no
# longer drop non-numeric columns silently in .corr() and raise instead, so the
# numeric columns are selected explicitly before computing the correlation.
numeric_corr = df_train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

## Data Analysis

In [None]:
# For every target column: print its name, its value counts, and draw a count plot.
target_columns = ['malignant','highly_malignant','loathe','rude','abuse','threat']
for target in target_columns:
    print(target)
    print('\n')
    print(df_train[target].value_counts())
    # Pass the Series as `x` by keyword: current seaborn treats the first
    # positional argument as `data`, which does not produce a per-value count.
    sns.countplot(x=df_train[target])
    plt.show()

Each count plot above shows one of our target features. All of them are heavily imbalanced, which can affect the final result, so we will apply an oversampling method later in the notebook.

In [None]:
# Build a 'Label' column holding, per row, the sum of all six target flags.
all_labels = ['malignant','highly_malignant','rude','threat','abuse','loathe']
target_flags = df_train.loc[:, all_labels]
df_train['Label'] = target_flags.sum(axis='columns')

In [None]:
# Preview the first eight rows, including the new 'Label' column.
df_train.iloc[:8]

In [None]:
# Count plot of the summed 'Label' column.
plt.figure(figsize=(9,5))
# Pass the Series as `x` by keyword: current seaborn treats the first
# positional argument as `data`, which does not produce a per-value count.
sns.countplot(x=df_train['Label'])
plt.title("Label Count",fontsize=20)
plt.show()

The plot above shows that most of the comments are good and only a small number are bad (0 = good comment; any other value = bad comment).

In [None]:
# Binarise the 'Label' column: any positive sum becomes 1, zero stays 0.
# 0 = good comments and 1 = bad comments
df_train['Label'] = df_train['Label'].gt(0).astype(int)

In [None]:
# Preview the first eight rows after 'Label' has been converted to 0/1.
df_train.iloc[:8]

In [None]:
# Count plot of the binary 'Label' column, followed by the exact class counts.
# Pass the Series as `x` by keyword: current seaborn treats the first
# positional argument as `data`, which does not produce a per-value count.
sns.countplot(x=df_train['Label'])
plt.show()

df_train['Label'].value_counts()

The count plot above shows that the target is still imbalanced even after merging all target columns into one. To address this, we will apply an oversampling method later in the notebook.

In [None]:
# Word cloud of the comments flagged as malignant, showing the 45 most prominent words.
from wordcloud import WordCloud

# Join every malignant comment into one string; WordCloud.generate expects a single text.
malignant_text = ' '.join(df_train.loc[df_train['malignant'] == 1, 'comment_text'])
malignant_cloud = WordCloud(
    width=750,
    height=500,
    background_color='black',
    max_words=45,
).generate(malignant_text)

plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(malignant_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word cloud of the comments flagged as abuse, showing the 45 most prominent words.
# Join every abuse comment into one string; WordCloud.generate expects a single text.
abuse_text = ' '.join(df_train.loc[df_train['abuse'] == 1, 'comment_text'])
abuse_cloud = WordCloud(
    width=750,
    height=500,
    background_color='black',
    max_words=45,
).generate(abuse_text)

plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(abuse_cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

## Data Preparation

In [None]:
# Libraries and shared objects for text cleaning, vectorising and oversampling.
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Lemmatizer reused by the cleaning cells below.
lemmatizer = WordNetLemmatizer()
# English stop-word list used when filtering tokens.
stop = stopwords.words('english')
# Fixed seed so the synthetic minority samples are identical on every run
# (same seed value as the train/test split uses).
smote = SMOTE(random_state=0)

In [None]:
# Character count of each raw comment, stored so it can be compared with the cleaned length later.
raw_comment_lengths = df_train["comment_text"].str.len()
df_train["comment_length"] = raw_comment_lengths

In [None]:
# Lower-case every comment into a new "cleaned" column.
# Note the column names differ in capitalisation between the two frames.
for frame, cleaned_col in ((df_train, 'Cleaned_comment_text'), (df_test, 'cleaned_comment_text')):
    frame[cleaned_col] = frame['comment_text'].str.lower()

In [None]:
# Remove punctuation (anything that is neither a word character nor whitespace)
# from the cleaned comment text of both frames.
# regex=True is required: without it newer pandas treats the pattern as a literal
# string and nothing is removed. The raw string avoids invalid-escape warnings.
df_train['Cleaned_comment_text'] = df_train['Cleaned_comment_text'].str.replace(r'[^\w\s]', '', regex=True)
df_test['cleaned_comment_text'] = df_test['cleaned_comment_text'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Remove stop words from the cleaned comment text of both frames.
# Membership is tested against a set so each token lookup is constant time
# instead of scanning the whole stop-word list.
stop_word_set = set(stop)


def _drop_stopwords(text):
    """Return `text` with every whitespace-separated token found in the stop-word set removed."""
    return " ".join(token for token in text.split() if token not in stop_word_set)


df_train['Cleaned_comment_text'] = df_train['Cleaned_comment_text'].apply(_drop_stopwords)
df_test['cleaned_comment_text'] = df_test['cleaned_comment_text'].apply(_drop_stopwords)
df_train.head()

In [None]:
# Remove every run of digits from the cleaned comment text of both frames.
# regex=True is required: without it newer pandas treats the pattern as a literal
# string and no digits are removed. The raw string avoids invalid-escape warnings.
df_train['Cleaned_comment_text'] = df_train['Cleaned_comment_text'].str.replace(r'\d+', '', regex=True)
df_test['cleaned_comment_text'] = df_test['cleaned_comment_text'].str.replace(r'\d+', '', regex=True)

In [None]:
# Lemmatization: reduce each token of the cleaned text to its base form.
def _lemmatize_tokens(text):
    """Lemmatize every whitespace-separated token of `text` and re-join with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


for frame, cleaned_col in ((df_train, 'Cleaned_comment_text'), (df_test, 'cleaned_comment_text')):
    frame[cleaned_col] = frame[cleaned_col].apply(_lemmatize_tokens)
df_train.head()

In [None]:
# Character count of each cleaned comment, stored next to the raw length.
cleaned_comment_lengths = df_train["Cleaned_comment_text"].str.len()
df_train["Cleaned_comment_length"] = cleaned_comment_lengths
df_train.head(n=5)

In [None]:
# Total number of characters before and after cleaning.
original_total = df_train['comment_length'].sum()
cleaned_total = df_train['Cleaned_comment_length'].sum()
print("Original Length: ", original_total)
print("Cleaned Length: ", cleaned_total)

In [None]:
# Vectorise the cleaned training text with TF-IDF, keeping at most 8000 terms.
tf_vec = TfidfVectorizer(stop_words='english', max_features=8000)

# The TF-IDF matrix is the model input (`feature` and `x` are the same object);
# the binary 'Label' column is the classification target.
x = feature = tf_vec.fit_transform(df_train['Cleaned_comment_text'])
y = df_train['Label']

In [None]:
# Vectorise the test text with the vectorizer that was already fitted on the training text.
# Fitting a fresh TfidfVectorizer here would learn a different vocabulary and different
# IDF weights, so the test columns would not line up with the features the models were
# trained on. Only transform() is applied, and `tf_vec` is left untouched.
feature_test = tf_vec.transform(df_test['cleaned_comment_text'])

In [None]:
# Handle the imbalanced target with SMOTE (Synthetic Minority Oversampling Technique),
# which generates synthetic samples of the minority class.
x_smote, y_smote = smote.fit_resample(x, y)

# Print the shapes before oversampling (x, y) and then after oversampling (x_smote, y_smote).
for array_like in (x, y, x_smote, y_smote):
    print(array_like.shape)

In [None]:
# Class counts of the original (not resampled) target column.
counts_before_resampling = y.value_counts()
counts_before_resampling

In [None]:
# Class counts of the target column after oversampling.
counts_after_resampling = y_smote.value_counts()
counts_after_resampling

In [None]:
# Split into train and hold-out sets, then oversample ONLY the training part.
from sklearn.model_selection import train_test_split

# Split the original (not resampled) data first. Splitting after SMOTE would put
# synthetic points interpolated from hold-out rows into the training set (and vice
# versa), which inflates every score. Stratify so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=0, stratify=y
)

# Balance the classes in the training portion only; the hold-out set keeps the
# real class distribution so the reported metrics reflect unseen data.
x_train, y_train = smote.fit_resample(x_train, y_train)

## Find Best Model

In [None]:
# Baseline classifiers for this binary classification problem:
# fit each one, then report train accuracy and hold-out metrics.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

model = [
    LogisticRegression(solver='liblinear'),
    DecisionTreeClassifier(),
    MultinomialNB(),
]

for estimator in model:
    estimator.fit(x_train, y_train)
    train_accuracy = estimator.score(x_train, y_train)
    holdout_pred = estimator.predict(x_test)

    print("Accuracy of", estimator, "is:")
    print("Accuracy of training model is:", train_accuracy)
    print("Accuracy Score:", accuracy_score(y_test, holdout_pred))
    print("Confusion matrix:", "\n", confusion_matrix(y_test, holdout_pred))
    print("Classification report:", "\n", classification_report(y_test, holdout_pred))
    print("************************************************************")
    print("\n")

## Bagging and Boosting methods

In [None]:
# Random forest: fit on the training split, then report train accuracy and hold-out metrics.
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: this model is later saved and its scores are quoted in the text,
# so it must be reproducible from a fresh kernel.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train,y_train)
train = rf.score(x_train,y_train)
pred_rf=rf.predict(x_test)
print("Accuracy of training model is:",train)
print("Accuracy Score:",accuracy_score(y_test,pred_rf))
print("Confusion matrix:","\n",confusion_matrix(y_test,pred_rf))
print("Classification report:","\n",classification_report(y_test,pred_rf))

In [None]:
# Grid search over the gradient boosting learning rate; prints the best setting found.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

parameters = {'learning_rate': [0.01, 0.1]}
gbc = GradientBoostingClassifier()
clf = GridSearchCV(estimator=gbc, param_grid=parameters)
clf.fit(x_train, y_train)
print(clf.best_params_)

In [None]:
# Gradient boosting with the learning rate chosen by the grid search above.
# GridSearchCV refits the best configuration on the whole training split by default,
# so that fitted estimator is reused instead of hard-coding the rate and training again.
gbc = clf.best_estimator_
train = gbc.score(x_train,y_train)
predgbc = gbc.predict(x_test)
print("Accuracy of training model is:",train)
# Reported as a fraction (not multiplied by 100) so it is comparable with the other models' cells.
print("Accuracy Score:",accuracy_score(y_test,predgbc))
print("Confusion matrix:","\n",confusion_matrix(y_test,predgbc))
print("Classification report:","\n",classification_report(y_test,predgbc))

In [None]:
# XGBoost: fit on the training split, then report train accuracy and hold-out metrics.
from xgboost import XGBClassifier

xg = XGBClassifier()
xg.fit(x_train, y_train)

train = xg.score(x_train, y_train)
predxg = xg.predict(x_test)
xg_accuracy = accuracy_score(y_test, predxg)
xg_confusion = confusion_matrix(y_test, predxg)
xg_report = classification_report(y_test, predxg)

print("Accuracy of training model is:", train)
print("Accuracy Score:", xg_accuracy)
print("Confusion matrix:", "\n", xg_confusion)
print("Classification report:", "\n", xg_report)

In [None]:
# CatBoost: fit on the training split, then report train accuracy and hold-out metrics.
from catboost import CatBoostClassifier

cb = CatBoostClassifier()
cb.fit(x_train, y_train)

train = cb.score(x_train, y_train)
predcb = cb.predict(x_test)
cb_accuracy = accuracy_score(y_test, predcb)
cb_confusion = confusion_matrix(y_test, predcb)
cb_report = classification_report(y_test, predcb)

print("Accuracy of training model is:", train)
print("Accuracy Score:", cb_accuracy)
print("Confusion matrix:", "\n", cb_confusion)
print("Classification report:", "\n", cb_report)

1. We tried several algorithms and found the random forest classifier to be the best model. It gives 99% training accuracy and 97% testing accuracy, along with good precision, recall, and F1 scores.
2. Apart from the small grid search for gradient boosting above, we do not tune hyperparameters, because it takes too much time and some algorithms also run into memory errors.

In [None]:
# ROC curve and AUC of the random forest on the hold-out set.
from sklearn.metrics import roc_curve,auc

# roc_curve expects (y_true, y_score) in that order. Use the predicted probability
# of the positive class as the score: hard 0/1 predictions give only a single
# operating point, and swapping the arguments measures the wrong thing.
rf_positive_scores = rf.predict_proba(x_test)[:, 1]
fpr,tpr,thresholds = roc_curve(y_test, rf_positive_scores)
roc_auc = auc(fpr,tpr)

plt.figure()
plt.plot(fpr,tpr,color="orange", lw=3, label=("ROC curve (area = %0.2f)" % roc_auc))
# Diagonal reference line = a classifier that guesses at random.
plt.plot([0,1],[0,1],color = "navy",lw=3,linestyle="--")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("RandomForest Classifier")
plt.legend(loc = "lower right")
plt.show()

The plot above shows the ROC curve of the random forest classifier, our best model; its area under the curve is 98%.

In [None]:
# Persist the random forest's hold-out predictions as CSV and the fitted model itself with joblib.
import joblib

df1 = pd.DataFrame(data=pred_rf)
df1.to_csv(path_or_buf="rf_malignant.csv")

joblib.dump(value=rf, filename="rf_malignant.obj")

In [None]:
# Predict labels for the vectorised test dataset with the random forest and show them.
test_dataset = rf.predict(X=feature_test)
print(test_dataset)

In [None]:
# Write the test-set predictions to CSV.
df2 = pd.DataFrame(data=test_dataset)
df2.to_csv(path_or_buf="rf_malignant_test.csv")