In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two CSV files of the twitter-hate-speech input into DataFrames.
train_path = '../input/twitter-hate-speech/train_E6oV3lV.csv'
test_path = '../input/twitter-hate-speech/test_tweets_anuFYb8.csv'
train_ds, test_ds = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training frame.
train_ds.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_ds.head(n=5)

In [None]:
import re
def tweet_cleaner(tweet) :
    """Strip Twitter-specific noise from a single tweet.

    Removes, in one regex pass: ``@mentions`` (alphanumeric handles only), the
    ``#`` symbol (the hashtag word itself is kept), retweet markers (``RT``
    followed by whitespace), ``http``/``https`` URLs, and every remaining
    character that is not an ASCII letter, digit, space or hyphen.

    Args:
        tweet: Raw tweet text (``str``).

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave leading or doubled spaces behind.
    """
    # Raw string: the previous non-raw literal relied on "\s", "\/" and "\S",
    # which are invalid string escapes and trigger a warning on recent Python.
    noise = r"(@[A-Za-z0-9]+)|(#)|(RT\s+)|(https?://\S+)|([^a-zA-Z0-9 -])"
    return re.sub(noise, "", tweet)

In [None]:
# Replace each raw training tweet with its cleaned form, then show the frame.
train_ds['tweet'] = train_ds['tweet'].map(tweet_cleaner)
train_ds

In [None]:
# Preview the first five training rows after cleaning.
train_ds.head(n=5)

In [None]:
# Replace each raw test tweet with its cleaned form, then show the frame.
test_ds['tweet'] = test_ds['tweet'].map(tweet_cleaner)
test_ds

In [None]:
# Drop the 'id' column from the training frame; no later cell in this notebook
# reads it. Reassigning (instead of inplace=True) with errors='ignore' keeps the
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because the column was already gone.
train_ds = train_ds.drop(columns='id', errors='ignore')
train_ds

In [None]:
# Drop the 'id' column from the test frame; no later cell in this notebook
# reads it. Reassigning (instead of inplace=True) with errors='ignore' keeps the
# cell safe to re-run: the in-place drop raised KeyError on a second execution
# because the column was already gone.
test_ds = test_ds.drop(columns='id', errors='ignore')
test_ds

In [None]:
from textblob import TextBlob
def getSubjectivity(tweet) :
    """Return the TextBlob subjectivity score computed for ``tweet``."""
    sentiment = TextBlob(tweet).sentiment
    return sentiment.subjectivity
def getPolarity(tweet) :
    """Return the TextBlob polarity score computed for ``tweet``."""
    sentiment = TextBlob(tweet).sentiment
    return sentiment.polarity
def getAnalysis(score) :
    """Bucket a polarity score into a sentiment label.

    Returns 'Neutral' when the score equals zero, 'Negative' when it is below
    zero, and 'Positive' in every other case.
    """
    if score == 0:
        return 'Neutral'
    if score < 0:
        return 'Negative'
    return 'Positive'

In [None]:
# Derive the TextBlob scores from the cleaned tweets, then bucket the polarity
# into the Neutral / Negative / Positive label stored in 'Analysis'.
cleaned_tweets = train_ds['tweet']
train_ds['Subjectivity'] = cleaned_tweets.map(getSubjectivity)
train_ds['Polarity'] = cleaned_tweets.map(getPolarity)
train_ds['Analysis'] = train_ds['Polarity'].map(getAnalysis)

train_ds

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for evaluation; the seed fixes the split.
tweet_text, tweet_label = train_ds["tweet"], train_ds["label"]
X_train, X_test, Y_train, Y_test = train_test_split(
    tweet_text,
    tweet_label,
    test_size = 0.20,
    random_state = 42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Token counts (English stop words removed) re-weighted by a TF-IDF transformer
# with sublinear term-frequency scaling.
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(sublinear_tf=True)

# Both steps are fitted on the training split only.
X_train_cnt = count_vect.fit_transform(X_train)
X_train_TF = transformer.fit_transform(X_train_cnt)
for matrix in (X_train_cnt, X_train_TF):
    print(matrix.shape)

# The held-out split is transformed with the already-fitted objects.
X_test_cnt = count_vect.transform(X_test)
X_test_TF = transformer.transform(X_test_cnt)
for matrix in (X_test_cnt, X_test_TF):
    print(matrix.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFClass
# Random forest (10 trees, entropy criterion) trained on the TF-IDF features.
# random_state is pinned so the fitted forest and the metrics reported in the
# next cell are repeatable across runs; it matches the seed (42) already used
# by the train/test splits and the decision tree in this notebook.
ranForModel = RFClass(n_estimators=10, criterion = "entropy", random_state = 42)
ranForModel.fit(X_train_TF, Y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_re
from sklearn.metrics import confusion_matrix as c_m
# Predict on the held-out TF-IDF matrix and print each metric in turn.
ranForPredict = ranForModel.predict(X_test_TF)
print("Predicted Class:",ranForPredict)
forest_reports = (
    ("Confusion Matrix:\n", c_m),
    ("Accuracy:", accuracy_score),
    ("F_score:", f1_score),
    ("Classification Report:\n", class_re),
)
for title, metric in forest_reports:
    print(title, metric(Y_test ,ranForPredict))

In [None]:
from wordcloud import WordCloud as WC
def _join_tweets(frame):
    """Concatenate the 'tweet' column of ``frame`` into one space-separated string."""
    # A space separator is required: joining with '' glued the last word of
    # each tweet onto the first word of the next one, producing bogus tokens
    # in the word cloud.
    return ' '.join(frame['tweet'])


def _make_cloud(text):
    """Build a 1000x600 word cloud from ``text`` with the notebook's fixed settings."""
    return WC(width = 1000, height = 600, random_state = 42, max_font_size = 120).generate(text)


# Cloud over every training tweet.
WordsInAllTweets = _join_tweets(train_ds)
wordcloud = _make_cloud(WordsInAllTweets)


# One cloud per sentiment bucket stored in the 'Analysis' column.
pos_train_ds = train_ds[train_ds['Analysis'] == "Positive"]
WordsInPosTweets = _join_tweets(pos_train_ds)
pos_wordcloud = _make_cloud(WordsInPosTweets)

neg_train_ds = train_ds[train_ds['Analysis'] == "Negative"]
WordsInNegTweets = _join_tweets(neg_train_ds)
neg_wordcloud = _make_cloud(WordsInNegTweets)

neu_train_ds = train_ds[train_ds['Analysis'] == "Neutral"]
WordsInNeuTweets = _join_tweets(neu_train_ds)
neu_wordcloud = _make_cloud(WordsInNeuTweets)

In [None]:
import matplotlib.pyplot as plt
# Switch to the 'fivethirtyeight' style and draw the all-tweets word cloud
# on a large, axis-free canvas.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize = (20,28))
ax.imshow(wordcloud, interpolation = "bilinear")
ax.axis('off')
plt.show()

In [None]:
# Draw the word cloud built from the 'Positive' tweets.
fig, ax = plt.subplots(figsize = (20,28))
ax.imshow(pos_wordcloud, interpolation = "bilinear")
ax.axis('off')
plt.show()

In [None]:
# Draw the word cloud built from the 'Neutral' tweets.
fig, ax = plt.subplots(figsize = (20,28))
ax.imshow(neu_wordcloud, interpolation = "bilinear")
ax.axis('off')
plt.show()

In [None]:
# Draw the word cloud built from the 'Negative' tweets.
fig, ax = plt.subplots(figsize = (20,28))
ax.imshow(neg_wordcloud, interpolation = "bilinear")
ax.axis('off')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as acc_score
from sklearn.metrics import classification_report as class_re
from sklearn.preprocessing import StandardScaler

# Decision tree trained on the three TextBlob-derived sentiment columns.

# Remove the raw text column; the later correlation/violin/pairplot cells use
# this frame. Reassigning with errors='ignore' keeps the cell re-runnable once
# the column is already gone (the in-place drop raised KeyError on a re-run).
train_ds = train_ds.drop(columns='tweet', errors='ignore')

# Encode the sentiment bucket numerically. Values that are not keys of the
# lookup pass through unchanged, so re-running the cell leaves the already
# encoded integers alone instead of turning the whole column into NaN.
analysis_codes = {"Neutral":0, "Negative":-1, "Positive":+1}
train_ds['Analysis'] = train_ds['Analysis'].map(lambda value: analysis_codes.get(value, value))
col_names = ["Subjectivity", "Polarity", "Analysis"]
target_name = ["label"]


X = train_ds[col_names]
# NOTE(review): the scaler is fitted on all rows before the split, so the test
# rows influence the scaling statistics; consider fitting on X_train only.
X_std_scal = StandardScaler().fit_transform(X)
Y = train_ds[target_name]

X_train, X_test, Y_train, Y_test = train_test_split(X_std_scal, Y, test_size = 0.20, random_state = 42)

DTreeClass = DecisionTreeClassifier(criterion = "entropy", random_state = 42, max_depth = 7)
DTreeClass.fit(X_train, Y_train)
Y_pred = DTreeClass.predict(X_test)
# Score the full data set on the standardised matrix: the tree was fitted on
# scaled features, so scoring the raw frame X (as before) applied its split
# thresholds to values in the wrong units.
Y_scored = DTreeClass.score(X_std_scal,Y)
Y_scored2 = DTreeClass.score(X_train,Y_train)
Y_scored3 = DTreeClass.score(X_test,Y_test)


print("Classification Report:\n",class_re(Y_test, Y_pred))
print("Confusion Matrix:\n",c_m(Y_test ,Y_pred))
print("F_score:", f1_score(Y_test ,Y_pred))
print("Accuracy:", acc_score(Y_test, Y_pred))
print("Predicted Class:",Y_pred)
print("Scored Class (From all data):",Y_scored)
print("Scored Class (From training data):",Y_scored2)
print("Scored Class (From testing data):",Y_scored3)


In [None]:
# Display the training frame as it stands after the decision-tree cell
# (the 'tweet' column dropped and 'Analysis' mapped to -1 / 0 / +1).
train_ds

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree as DOT source and hand it to graphviz for rendering.
treePic = export_graphviz(
    DTreeClass,
    out_file = None,
    feature_names = col_names,
    class_names = ['0','1'],
    filled = True,
    rounded = True,
    special_characters = True,
)

graph = graphviz.Source(treePic)
# NOTE(review): view = True asks graphviz to open an external viewer; confirm
# this is wanted in a hosted/headless kernel, since `graph` is shown inline below.
graph.render("decision_tree", view = True)
graph

In [None]:
treePic = export_graphviz(DTreeClass, out_file = 'DTreeClass.dot', filled = True, rounded = True,special_characters = True, feature_names = col_names ,class_names = ['1','0'])

!dot -Tpng DTreeClass.dot -o DTreeClass.png -Gdpi=600

In [None]:
from IPython.display import Image
# Show the PNG written by the previous cell inline at a reduced display size.
Image(width = 800, height = 300, filename = 'DTreeClass.png')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Pairwise correlations of the columns left in train_ds, drawn as an annotated
# heatmap on a 28x20 inch figure.
corr = train_ds.corr()
heatmap_options = dict(
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
graph1 = sns.heatmap(corr, **heatmap_options)
sns.despine()
graph1.figure.set_size_inches(28,20)

plt.show()

In [None]:
# Distribution of the 'Subjectivity' column for each value of 'label'.
# The former `size = 28` argument was removed: `size` is not a violinplot
# parameter, and the figure size is set explicitly via set_size_inches below.
graph2 = sns.violinplot(y = "Subjectivity",x = "label", data = train_ds)
sns.despine()
graph2.figure.set_size_inches(28,20)
plt.show()

In [None]:
# Distribution of the 'Polarity' column for each value of 'label'.
# The former `size = 28` argument was removed: `size` is not a violinplot
# parameter, and the figure size is set explicitly via set_size_inches below.
graph3 = sns.violinplot(y = "Polarity",x = "label", data = train_ds)
sns.despine()
graph3.figure.set_size_inches(28,20)
plt.show()

In [None]:
# Distribution of the numerically encoded 'Analysis' column for each value of 'label'.
# The former `size = 28` argument was removed: `size` is not a violinplot
# parameter, and the figure size is set explicitly via set_size_inches below.
graph4 = sns.violinplot(y = "Analysis",x = "label", data = train_ds)
sns.despine()
graph4.figure.set_size_inches(28,20)
plt.show()

In [None]:
# Pairwise plots of the train_ds columns, coloured by 'label'.
graph5 = sns.pairplot(data = train_ds, hue = "label", height = 5.0)