In [30]:
import nltk, pandas as pd, numpy as np, string, joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
#nltk.download_shell()

In [2]:
df=pd.read_csv('messages.csv')

In [3]:
df.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [4]:
df.shape

(2893, 3)

In [5]:
df=df.drop_duplicates()

In [6]:
df.shape

(2876, 3)

In [7]:
df.isna().sum()

subject    62
message     0
label       0
dtype: int64

In [8]:
df=df.dropna()

In [9]:
df.shape

(2814, 3)

In [10]:
def clean_text(text):
    nopunc=[txt for txt in text if text not in string.punctuation]
    nopunc=''.join(nopunc)
    
    clean_words=[word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    #clean_words=''.join(clean_words)
    
    return clean_words

In [11]:
def join_func(text1,text2):
    text=text1.strip()+' '+text2.strip()
    
    return text

In [12]:
df['subject message']=df.apply(lambda x:join_func(x['subject'],x['message']),axis=1).reset_index().drop('index',axis=1)

In [13]:
df.isna().sum()

subject             0
message             0
label               0
subject message    79
dtype: int64

In [14]:
df=df.dropna(axis=0)

In [15]:
message_bow=CountVectorizer(analyzer=clean_text).fit_transform(df['subject message'])

In [16]:
message_bow.shape

(2735, 63393)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(message_bow, df['label'], test_size=0.25, random_state=101,)

In [18]:
model=MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB()

In [19]:
predict=model.predict(X_test)

In [20]:
print(confusion_matrix(predict,y_test))
print()
print(classification_report(predict,y_test))

[[513 107]
 [ 57   7]]

              precision    recall  f1-score   support

           0       0.90      0.83      0.86       620
           1       0.06      0.11      0.08        64

    accuracy                           0.76       684
   macro avg       0.48      0.47      0.47       684
weighted avg       0.82      0.76      0.79       684



In [22]:
model=RandomForestClassifier()
model.fit(X_train,y_train)

RandomForestClassifier()

In [23]:
predict=model.predict(X_test)

In [24]:
print(confusion_matrix(predict,y_test))
print()
print(classification_report(predict,y_test))

[[561 113]
 [  9   1]]

              precision    recall  f1-score   support

           0       0.98      0.83      0.90       674
           1       0.01      0.10      0.02        10

    accuracy                           0.82       684
   macro avg       0.50      0.47      0.46       684
weighted avg       0.97      0.82      0.89       684



In [27]:
g_params={'n_estimators':[100,150,200,250,300]}
gscv=GridSearchCV(RandomForestClassifier(),g_params)

In [28]:
gscv.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [100, 150, 200, 250, 300]})

In [29]:
gscv.best_params_

{'n_estimators': 100}

In [31]:
joblib.dump(model,'RandomForest NLP.obj')

['RandomForest NLP.obj']