# Implementing LogisticRegression algorithms

import pandas as pd
import numpy as np

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math as m

In [2]:
data = pd.read_csv('A.csv')

In [3]:
data.head()

Unnamed: 0,description,fraudulent
0,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1
1,The group has raised a fund for the purchase o...,1
2,Technician Instrument &amp; ControlsLocation D...,1
3,Sales Executive,1
4,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1


In [4]:
data.shape

(1660, 2)

In [5]:
df = data.dropna()

In [6]:
df['fraudulent'].unique()

array([1, 0], dtype=int64)

In [7]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
# remove whitespaces
df['description']=df['description'].str.strip()
# lowercase the text
df['description'] = df['description'].str.lower()
#remove punctuation
punc = string.punctuation
table = str.maketrans('','',punc)
df['description']=df['description'].apply(lambda x: x.translate(table))
# tokenizing each message
df['description']=df.apply(lambda x: x['description'].split(' '),axis=1)
# removing stopwords
df['description'] = df.apply(lambda x: [word for word in x['description'] if word not in stopwords.words('english')],axis=1)
# stemming
ps = PorterStemmer()
df['description']= df.apply(lambda x: [ps.stem(word) for word in x['description']],axis=1)
# remove single letter words
df['description'] = df.apply(lambda x: ' '.join([word for word in x['description'] if len(word)>1]),axis=1)


In [9]:
import numpy as np
X = np.array(df["description"])
y = np.array(df["fraudulent"])

In [10]:
y[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_X = cv.fit_transform(X) # Fit the Data

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cv_X, y, test_size=20, random_state=42)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [14]:
PredictRFC = lr.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, PredictRFC)*100
print('Accuracy of LogisticRegression is: ', m.ceil(acc))

Accuracy of LogisticRegression is:  90


In [16]:
from sklearn.metrics import confusion_matrix
print('Confusuion matrix of LogisticRegression\n',confusion_matrix(y_test,PredictRFC))

Confusuion matrix of LogisticRegression
 [[11  1]
 [ 1  7]]


In [17]:
from sklearn.metrics import classification_report
print('Classification report of LogisticRegression\n\n',classification_report(y_test,PredictRFC))

Classification report of LogisticRegression

               precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.88      0.88      0.88         8

    accuracy                           0.90        20
   macro avg       0.90      0.90      0.90        20
weighted avg       0.90      0.90      0.90        20



In [18]:
import joblib
joblib.dump(cv,'lr_tv.pkl')
joblib.dump(lr,'lr.pkl')

['lr.pkl']