Importing Modules

In [1]:
import numpy as np
import pandas as pd

Reading CSV File

In [2]:
df = pd.read_csv('./Abusive comments analysis dataset (Responses) - Abusive comments analysis dataset (Responses).csv')
df=df.dropna()

In [None]:
import re
stopwords = ['cheyagalgindi','cheyyagalgindi','cheyyagaligindi'
    'gurinchi',
    'pai',
    'prakaram',
    'anugunanga', 'anukulanga', 'atlane',
    'addamga',
    'nijamga',
    'tarvata',
    'malli','malla'
    'vyathirekanga',
    'kadu',
    'andaru',
    'anumathichi',
    'anumathistundi',
    'daadapu', 'deggar deggarga',
    'matrame',
    'venta','enta'
    'ippatike',
    'kuda',
    'aiyte',
    'eppudu',
    'odda','vadda',
    'madhya','madya',
    'okkate',
    'inka',
    'inkokati',
    'ye','ehh',
    'evaro okaru',
    'emaina gaani','emainappatiki'
    'evaraina',
    'edaina','yedaina',
    'emainappatiki','yemainappatiki',
    'ekkadaina','yekkadaina',
    'veruga',
    'kanipistayi',
    'mechuko',
    'sakkaga','tagina'
    'unnaru',
    'kaadu',
    'chuttu',
    'ga','gaa'
    'oka pakkana',
    'adagandi','adagali',
    'adagadam',
    'sambandam',
    'odda','vadda',
    'andubatulo',
    'duranga']
from nltk.stem.porter import PorterStemmer
corpus = []
for i in df['Comment']:
    comment = re.sub('[^a-zA-Z]',' ',i)
    comment = comment.lower()
    comment = comment.split()
    ps = PorterStemmer()
    comment = [ps.stem(word) for word in comment if word not in set(stopwords)]
    comment = ' '.join(comment)
    corpus.append(comment)


Seperating features and labels

In [4]:
y = df.iloc[:,1].values
print(df.iloc[:,-1].value_counts())


Abusive
No     4786
Yes    1433
Name: count, dtype: int64


Label Encoding 'y'

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(y)

y = le.transform(y)

Using TF-IDF Vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus).toarray()
print(vectorizer.get_feature_names_out())
x.shape

['aa' 'aaa' 'aaaa' ... 'zuckerberg' 'zulubar' 'zz']


(6219, 10186)

Splitting Training and test dataset

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1,stratify=y)

In [10]:
print(X_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
print(X_train.shape,y_train.shape)
print(y_train)

(4975, 10186) (4975,)
[1 0 1 ... 1 0 1]


Training Model with Linear SVC

In [12]:
from sklearn.svm import SVC
lsvc = SVC(kernel='linear',C=1)
lsvc.fit(X_train, y_train)

Training Model with Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion='log_loss',splitter='best')
dtc.fit(X_train, y_train)

Training Model with KNN Classifier

In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)

Training Model with Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 0)
lr.fit(X_train, y_train)

Training Model with Naive Bayes Classifier

In [16]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

Training Model with Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 20, random_state = 0)
rfc.fit(X_train, y_train)

Training Model with RBF SVC

In [18]:
from sklearn.svm import SVC
rsvc = SVC(kernel='rbf')
rsvc.fit(X_train, y_train)

Training Model with XGBoost Classifier

In [19]:
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

In [20]:
from sklearn.ensemble import StackingClassifier
classifiers = [('decisiontree',dtc),
               ('randomforest',rfc),
               ('linearsvc',lsvc),
               ('rbfsvc',rsvc),
               ('xgboost',xgbc),
               ('naivebayes',nb),
               ('logisticregression',lr),
               ('knearestneighbors',knn)
               ]

stack_model = StackingClassifier(estimators=classifiers,final_estimator=lsvc)
stack_model.fit(X_train,y_train)

Testing the model

In [21]:
text = ["Pandi"]
text = vectorizer.transform(text).toarray()
ans=stack_model.predict(text)
print(ans[0])

0


Accuracy and confusion matrix

In [22]:
from sklearn.metrics import accuracy_score,confusion_matrix
predictions = stack_model.predict(X_test)
cm = confusion_matrix(y_test,predictions)
print(cm)
print(accuracy_score(y_test,predictions))

[[949   8]
 [119 168]]
0.8979099678456591


K-Fold cross validation

In [23]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=stack_model,X=X_train,y=y_train,cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 87.78 %
Standard Deviation: 1.12 %


In [None]:
import pickle
filename = 'teluguAbusiveCommentDetector.sav'
# pickle.dump(stack_model,open(filename,'wb'))
pickle.dump(vectorizer,open('tfidf.sav','wb'))