In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
df = pd.read_csv('../Dataset/mergedData.csv')
label = df['label']

In [3]:
tf = TfidfVectorizer(ngram_range=(1,2))

In [4]:
tf_vec = tf.fit_transform(df['text'])

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    tf_vec,label, test_size=0.3, shuffle=True, random_state=42,)

In [6]:
parameters = {
    'n_estimators':[100,200,300],
    'max_depth': [70, 80 , 90, 100]
}

In [7]:
model = RandomForestClassifier()

In [8]:
clf = GridSearchCV(model,param_grid=parameters,scoring='accuracy',cv = 2)

In [9]:
clf.fit(tf_vec,label)

GridSearchCV(cv=2, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [70, 80, 90, 100],
                         'n_estimators': [100, 200, 300]},
             scoring='accuracy')

In [14]:
clf.best_params_

{'max_depth': 80, 'n_estimators': 100}

In [10]:
model = RandomForestClassifier(n_estimators=100,max_depth=80)

In [11]:
model.fit(x_train,y_train)

RandomForestClassifier(max_depth=80)

In [12]:
cross_val_score(model,tf_vec,label, scoring='accuracy')

array([0.89197531, 0.77954145, 0.94223986, 0.90427878, 0.89016321])

In [13]:
pred = model.predict(x_test)

In [14]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1591
           1       0.97      0.95      0.96      1811

    accuracy                           0.96      3402
   macro avg       0.96      0.96      0.96      3402
weighted avg       0.96      0.96      0.96      3402



In [15]:
with open('tfidf_tokenizer.pkl','wb') as f:
    pickle.dump(tf,f)

with open('random_forest.pkl', 'wb') as f:
    pickle.dump(model,f)