In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report 

In [2]:
train_data = pd.read_csv('tokenized.csv', usecols=['fraudulent','lemm_tokens'])
train_data.head()

Unnamed: 0,fraudulent,lemm_tokens
0,0,"['we', 'food52', 'create', 'groundbreaking', '..."
1,0,"['90', 'second', 'worlds', 'cloud', 'video', '..."
2,0,"['valor', 'service', 'provide', 'workforce', '..."
3,0,"['our', 'passion', 'improve', 'quality', 'life..."
4,0,"['spotsource', 'solutions', 'llc', 'global', '..."


In [3]:
print('Shape of training data :',train_data.shape)

Shape of training data : (17880, 2)


In [4]:
y = train_data["fraudulent"]
X = train_data["lemm_tokens"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

In [6]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(14304, 81083)
(3576, 81083)


# Check the words 'learnt' by vectorizer

In [7]:
tfidf.vocabulary_

{'unspecified': 72826,
 'what': 76669,
 'casumo': 10843,
 'company': 13513,
 'technology': 69297,
 'startup': 65649,
 'form': 27950,
 '2012': 557,
 'product': 52923,
 'recognize': 55900,
 'award': 7179,
 'win': 76862,
 'online': 45805,
 'casino': 10820,
 'concept': 14078,
 'though': 70339,
 'literally': 38471,
 'whole': 76748,
 'new': 44338,
 'universe': 72684,
 'we': 76234,
 'work': 77137,
 'change': 11375,
 'common': 13207,
 'perception': 48529,
 'through': 70383,
 'design': 18440,
 'amp': 4039,
 'innovation': 33876,
 'aim': 3583,
 'disrupt': 20265,
 'young': 78072,
 'many': 40204,
 'ways': 76191,
 'immature': 32535,
 'industry': 33312,
 'ignore': 32362,
 'rule': 59877,
 'found': 28113,
 'upon': 72985,
 'single': 62714,
 'promise': 54076,
 'want': 76052,
 'deliver': 17786,
 'world': 77453,
 'best': 8263,
 'game': 28883,
 'experience': 25422,
 'no': 44569,
 'bullet': 9609,
 'brand': 9093,
 'book': 8856,
 'believe': 8061,
 'understand': 72530,
 'data': 16816,
 'play': 50080,
 'key': 36

# SGDClassifier

In [8]:
sgd = SGDClassifier(loss="modified_huber", penalty="l2", shuffle=True, random_state=420)
sgd.fit(X_train_tfidf, y_train)
y_predicted_sgd = sgd.predict(X_test_tfidf)
print(classification_report(y_test, y_predicted_sgd))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3428
           1       0.98      0.81      0.89       148

    accuracy                           0.99      3576
   macro avg       0.98      0.90      0.94      3576
weighted avg       0.99      0.99      0.99      3576



# Logistic Regression

In [9]:
log = LogisticRegression(C=42.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', random_state=42)
log.fit(X_train_tfidf, y_train)
y_predicted_log = log.predict(X_test_tfidf)

print(classification_report(y_test, y_predicted_log))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      3428
           1       0.84      0.89      0.86       148

    accuracy                           0.99      3576
   macro avg       0.92      0.94      0.93      3576
weighted avg       0.99      0.99      0.99      3576



# Complement Naive Bayes 

In [10]:
cnb = ComplementNB()
cnb.fit(X_train_tfidf, y_train)
y_predicted_cnb = cnb.predict(X_test_tfidf)

print(classification_report(y_test, y_predicted_cnb))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3428
           1       0.14      0.01      0.01       148

    accuracy                           0.96      3576
   macro avg       0.55      0.50      0.50      3576
weighted avg       0.93      0.96      0.94      3576



# Random Forest Classifier

In [12]:
regr=RandomForestClassifier(n_estimators = 200, max_features = "sqrt", random_state = 420)
regr.fit(X_train_tfidf,y_train)
y_predicted_regr = regr.predict(X_test_tfidf)

print(classification_report(y_test,y_predicted_regr))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3428
           1       1.00      0.71      0.83       148

    accuracy                           0.99      3576
   macro avg       0.99      0.85      0.91      3576
weighted avg       0.99      0.99      0.99      3576

