In [63]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

In [62]:
model_dir=os.path.join("..","Models")

In [2]:
final_data_path=os.path.join("..","Data","final_data_for_ml_model.csv")

In [3]:
final_data=pd.read_csv(final_data_path)

In [4]:
final_data

Unnamed: 0,num_single_quote_error,num_spacing_error,num_social_media_handles,num_urls_text,num_twitter_post_urls,number_of_hastags,num_of_absence_of_space_after_sentence_completion,num_capitalized_words,num_of_absence_capitalization_after_full_stop,num_of_words_of_pattern_2017word,num_mispelled_words,label
0,7,6,0,0,0,0,4,2,2,0,14,0
1,12,6,1,1,0,0,3,6,1,0,25,0
2,3,11,5,0,2,4,8,12,4,0,23,0
3,0,3,0,0,0,0,0,2,0,0,4,1
4,0,2,0,0,0,0,2,0,0,0,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...
44893,4,3,0,0,0,0,0,9,4,0,6,1
44894,1,5,0,0,0,0,1,25,1,0,14,0
44895,5,14,0,0,0,0,4,0,0,0,3,0
44896,0,5,0,0,0,0,3,6,1,0,10,1


In [5]:
data_X=final_data.drop('label',axis=1,inplace=False)

In [6]:
data_Y=final_data['label']

# TRAIN VALIDATION SPLIT

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_X,val_X,train_Y,val_Y=train_test_split(data_X,data_Y,test_size=0.2,random_state=42)

# EVALUATION FUNCTION

In [9]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [10]:
def evaluate_model(ytrue,ypred):
    print(f"Accuracy: {accuracy_score(ytrue,ypred)}")
    print(f"Precision: {precision_score(ytrue,ypred)}")
    print(f"Recall: {recall_score(ytrue,ypred)}")
    print(f"F1 Score: {f1_score(ytrue,ypred)}")
    

In [11]:
from sklearn.model_selection import GridSearchCV

# MODEL SELECTION

## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
model=LogisticRegression()

In [14]:
params={
    'penalty':['l1','l2','elasticnet'],
    'C':[1.0,0.5,2],
    'max_iter':[100,75,125],
    'warm_start':[True,False]
}

In [15]:
logistic_reg=GridSearchCV(estimator=model,param_grid=params,return_train_score=True)

In [16]:
logistic_reg.fit(train_X,train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### EVALUATE MODEL

In [17]:
ypred=logistic_reg.predict(val_X)

In [18]:
evaluate_model(val_Y,ypred)

Accuracy: 0.8632516703786192
Precision: 0.8230978830433872
Recall: 0.9109255393180237
F1 Score: 0.864787491741907


### BEST PARAMS

In [19]:
logistic_reg.best_params_

{'C': 2, 'max_iter': 100, 'penalty': 'l2', 'warm_start': True}

### SAVE MODEL

In [65]:
joblib.dump(logistic_reg,os.path.join(model_dir,"logistic_reg.pkl"))

['../Models/logistic_reg.pkl']

## Polynomial Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
from sklearn.preprocessing import PolynomialFeatures

In [22]:
from sklearn.pipeline import Pipeline

In [23]:
model=Pipeline([('poly_features',PolynomialFeatures()),('logistic_reg',LogisticRegression())])

In [24]:
params={
    'poly_features__degree':[1,2,3],
    'logistic_reg__penalty':['l1','l2','elasticnet'],
    'logistic_reg__C':[1.0,0.5,2],
    'logistic_reg__max_iter':[100,75,125],
    'logistic_reg__warm_start':[True,False]
}

In [25]:
poly_logistic_reg=GridSearchCV(estimator=model,param_grid=params)

In [26]:
poly_logistic_reg.fit(train_X,train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### EVALUATE MODEL

In [27]:
ypred=poly_logistic_reg.predict(val_X)

In [28]:
evaluate_model(val_Y,ypred)

Accuracy: 0.8828507795100222
Precision: 0.8208308722189407
Recall: 0.9670610067269775
F1 Score: 0.8879659211927583


### BEST PARAMS

In [29]:
poly_logistic_reg.best_params_

{'logistic_reg__C': 0.5,
 'logistic_reg__max_iter': 125,
 'logistic_reg__penalty': 'l2',
 'logistic_reg__warm_start': True,
 'poly_features__degree': 2}

### SAVE MODEL

In [66]:
joblib.dump(poly_logistic_reg,os.path.join(model_dir,"poly_logistic_reg.pkl"))

['../Models/poly_logistic_reg.pkl']

## SVC

In [30]:
from sklearn.svm import SVC

In [31]:
svc_model=SVC(C=2,kernel='rbf',gamma="auto")

In [32]:
svc_model.fit(train_X,train_Y)

### EVALUATE MODEL

In [33]:
ypred=svc_model.predict(val_X)

In [34]:
evaluate_model(val_Y,ypred)

Accuracy: 0.9259465478841871
Precision: 0.9367513176808816
Recall: 0.9069821387149153
F1 Score: 0.9216263995285798


### SAVE MODEL

In [67]:
joblib.dump(svc_model,os.path.join(model_dir,"svc.pkl"))

['../Models/svc.pkl']

## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
model=DecisionTreeClassifier()

In [40]:
params={
    'criterion':['gini','entropy','log_loss'],
    'splitter':['best','random'],
    'max_depth':[3,6,8],
    'max_features':['sqrt','log2',5],
    'max_leaf_nodes':[None,2]
}

In [41]:
decision_tree_model=GridSearchCV(estimator=model,param_grid=params)

In [43]:
decision_tree_model.fit(train_X,train_Y)

### EVALUATE MODEL

In [44]:
ypred=decision_tree_model.predict(val_X)

In [45]:
evaluate_model(val_Y,ypred)

Accuracy: 0.9243875278396436
Precision: 0.901060070671378
Recall: 0.9464161447459986
F1 Score: 0.9231813553569408


### BEST PARAMS

In [46]:
decision_tree_model.best_params_

{'criterion': 'log_loss',
 'max_depth': 8,
 'max_features': 5,
 'max_leaf_nodes': None,
 'splitter': 'best'}

### SAVE MODEL

In [68]:
joblib.dump(decision_tree_model,os.path.join(model_dir,"decision_tree.pkl"))

['../Models/decision_tree.pkl']

## Random Forest Classifier

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
model=RandomForestClassifier()

In [55]:
params={
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[3,6,8],
    'max_features':['sqrt','log2',5],
    'max_leaf_nodes':[None,2],
    'n_estimators':[100,75,125],
    'bootstrap':[True,False],
    'warm_start':[True,False]
}

In [56]:
random_forest_model=GridSearchCV(estimator=model,param_grid=params)

In [57]:
random_forest_model.fit(train_X,train_Y)

### EVALUATE MODEL

In [59]:
ypred=random_forest_model.predict(val_X)

In [60]:
evaluate_model(val_Y,ypred)

Accuracy: 0.9373051224944321
Precision: 0.9207453973955995
Recall: 0.9512874043145442
F1 Score: 0.9357672561323446


### BEST PARAMS

In [61]:
random_forest_model.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 5,
 'max_leaf_nodes': None,
 'n_estimators': 75,
 'warm_start': True}

### SAVE MODEL

In [69]:
joblib.dump(random_forest_model,os.path.join(model_dir,"random_forest.pkl"))

['../Models/random_forest.pkl']

## VOTING CLASSIFIER

In [75]:
from sklearn.ensemble import VotingClassifier

In [77]:
voting_classifier_model=VotingClassifier(
    estimators=[
        ('logistic_reg',logistic_reg),
        ('poly_logistic_reg',poly_logistic_reg),
        ('decision_tree',decision_tree_model)
    ],
    voting='soft',
    weights=[1,1,2]
)

In [78]:
voting_classifier_model.fit(train_X,train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### EVALUATE MODEL

In [79]:
ypred=voting_classifier_model.predict(val_X)

In [80]:
evaluate_model(val_Y,ypred)

Accuracy: 0.9342984409799554
Precision: 0.9126192060323797
Recall: 0.9545349106935745
F1 Score: 0.9331065759637188


### SAVE MODEL

In [81]:
joblib.dump(voting_classifier_model,os.path.join(model_dir,"voting_classifier.pkl"))

['../Models/voting_classifier.pkl']

## STACKING CLASSIFIER

In [83]:
from sklearn.ensemble import StackingClassifier

In [86]:
from sklearn.svm import SVC

In [87]:
stacking_classifier=StackingClassifier(
    estimators=[
        ("logistic_reg",logistic_reg),
        ("poly_logistic",poly_logistic_reg),
        ("decison_tree",decision_tree_model)
    ],
    final_estimator=SVC(),
    passthrough=False
     
)

In [None]:
stacking_classifier.fit(train_X,train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### EVALUATE MODEL

In [None]:
ypred=stacking_classifier.predict(val_X)

In [None]:
evaluate_model(val_Y,ypred)

### SAVE MODEL

In [None]:
joblib.dump(stacking_classifier,os.path.join(model_dir,"stacking_classifier"))

## STACKING AND VOTING HYBRID CLASSIFIER

In [None]:
voting_classifier_model=VotingClassifier(
    estimators=[
        ("logistic_reg",logistic_reg),
        ("poly_logistic_reg",poly_logistic_reg),
        ("decision_tree",decision_tree_model)
    ],
    voting="soft",
    weights=[1,1,2]
)

In [None]:
stacking_voting_hybrid_classifier=StackingClassifier(
    estimators=[
         ("logistic_reg",logistic_reg),
        ("poly_logistic_reg",poly_logistic_reg),
        ("decision_tree",decision_tree_model)
    ],
    final_estimator=voting_classifier_model,
    passthrough=False
)