In [335]:
import requests
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


%matplotlib inline 


In [254]:
df = pd.read_csv('clean_data1.csv',)

In [255]:
df = df[['title','target']]

In [256]:
df.head()

Unnamed: 0,title,target
0,anyone figured out how hide the user interface...,0
1,payment issue,0
2,recommendation,0
3,idea what netflix can about constant cancellin...,0
4,time difference different country,0


In [257]:
df.shape

(20630, 2)

In [258]:
X= df['title']
y= df['target']


In [259]:
#  Checking the proportion
y.value_counts(normalize=True)

1    0.501697
0    0.498303
Name: target, dtype: float64

In [260]:
# Split the data into the Training and Testing sets

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    stratify=y,
                                                    random_state=42)

In [261]:
X_test

3672                         what everyone watching today 
306                       time drop your subscription boy 
3417        bad vegan spoiler help understanding the scam 
15400    converted @ poster awesome official moon knigh...
12973    disney date announcement coming really soon fo...
                               ...                        
10433    had the opportunity meet acting legend carl we...
15071    think disney plus might pushing this film the ...
11035          adding hulu and espn bundle prepaid disney 
4167       question about the ending the innocent spoiler 
7939                        better call saul season recap 
Name: title, Length: 4126, dtype: object

### Stopwords using sklearn

In [262]:
# A quick look at stop words

stop_words = list(_stop_words.ENGLISH_STOP_WORDS)
print(stop_words)
print(len(stop_words))

['there', 'further', 'ten', 'its', 'has', 'everyone', 'almost', 'move', 'because', 'each', 'more', 'so', 're', 'elsewhere', 'one', 'against', 'whereafter', 'front', 'whereby', 'eight', 'amoungst', 'but', 'very', 'fill', 'that', 'always', 'which', 'nine', 'three', 'fire', 'becomes', 'much', 'get', 'may', 'yours', 'seems', 'are', 'along', 'name', 'noone', 'sincere', 'side', 'hers', 'such', 'under', 'everything', 'describe', 'sometime', 'former', 'back', 'namely', 'yourself', 'next', 'alone', 'many', 'since', 'might', 'here', 'through', 'whom', 'i', 'in', 'had', 'other', 'out', 'empty', 'first', 'ourselves', 'nothing', 'something', 'see', 'what', 'all', 'ever', 'we', 'couldnt', 'wherever', 'below', 'ours', 'thence', 'himself', 'same', 'either', 'must', 'now', 'anyway', 'again', 'from', 'when', 'my', 'became', 'sixty', 'done', 'becoming', 'who', 'and', 'own', 'about', 'anyhow', 'their', 'forty', 'un', 'someone', 'whoever', 'our', 'themselves', 'been', 'with', 'bottom', 'fifteen', 'both', '

## CountVectorizer - Model 1

In [449]:
# Instantiate a Countvectorizer
# Ignoring the works that occur in 98% of the documents
# A word must occut in atleast two documents from the corpus
# Adding Stopwords from Sklearn dataset

cvec= CountVectorizer()

In [450]:
# Transform the corpus
X_train = cvec.fit_transform(X_train)

In [451]:
X_train

<15472x3479 sparse matrix of type '<class 'numpy.int64'>'
	with 124472 stored elements in Compressed Sparse Row format>

In [452]:
# Convert X_train into a DataFrame.

X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=cvec.get_feature_names())
X_train_df

Unnamed: 0,aaron,abandon,abbey,abbott,abby,abc,abercrombie,ability,able,about,...,zeen,zeke,zero,zoey,zone,zoomed,zootopia,zootropolis,zorro,zutomayo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15468,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
15469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [453]:
# Transform test
X_test = cvec.transform(X_test)
X_test_df = pd.DataFrame(X_test.toarray(), # without toarray it cannot be converted to dense vector 
                         columns=cvec.get_feature_names())
X_test_df

Unnamed: 0,aaron,abandon,abbey,abbott,abby,abc,abercrombie,ability,able,about,...,zeen,zeke,zero,zoey,zone,zoomed,zootopia,zootropolis,zorro,zutomayo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5153,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [454]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [455]:
# Baseline Accuracy is always majority class
y_test.value_counts(normalize=True) # Accuracy is always majority class

1    0.501745
0    0.498255
Name: target, dtype: float64

### Pipeline for CountVectorizer and Logsitic Regression


In [456]:
# Let's set it up with two stages:
# 1. An instance of CountVectorizer (transformer)
# 2. A LogisticRegression instance (estimator)

pipe= Pipeline([('cvec',CountVectorizer()),

                ('lr', LogisticRegression())])

In [457]:
# Evaluate how your model will perform on unseen data

cross_val_score(pipe, X_train, y_train, cv=10).mean() 

0.9972854019365647

In [458]:
# Fit your model
pipe.fit(X_train, y_train)

Pipeline(steps=[('cvec', CountVectorizer()), ('lr', LogisticRegression())])

In [459]:
# Training score
pipe.score(X_train, y_train)

0.9976085832471562

In [460]:
# Test score
pipe.score(X_test, y_test)

0.9970918960837534

### GridsearchCV for CountVectorizer and LogisticRegression

In [461]:
# Search over the following values of hyperparameters:
# Maximum number of features fit: 1000, 4000, 6000
# Minimum number of documents needed to include token: 2, 3
# Maximum number of documents needed to include token: 95%, 98%
# Check (individual tokens) and also check (individual tokens and bigrams).

pipe_params = {'cvec__max_features': [1000,4000,6000],
              'cvec__min_df' :[2,3],
               'cvec__max_df' :[0.95,0.98],
               'cvec__ngram_range' :[(1,1),(1,2),(1,3)]
              }

In [462]:
# Instantiate GridSearchCV.

gs = GridSearchCV(pipe, # what object are we optimizing?
                  param_grid = pipe_params, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [463]:
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.95, 0.98],
                         'cvec__max_features': [1000, 4000, 6000],
                         'cvec__min_df': [2, 3],
                         'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)]})

In [464]:
# What's the best score?
print(gs.best_score_)


0.9968975128264305


In [465]:
# Save best model as gs_model.
#Pipeline(steps=[('cvec',
                # CountVectorizer(max_df=0.95, max_features=4000, min_df=2)),
                #('lr', LogisticRegression())])
gs_model = gs.best_estimator_
gs_model

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.95, max_features=4000, min_df=2)),
                ('lr', LogisticRegression())])

In [466]:
# Score model on training set.
gs_model.score(X_train, y_train)

0.9976085832471562

In [467]:
# Score model on testing set.
gs_model.score(X_test, y_test)

0.9970918960837534

In [468]:
predictions_CV = gs_model.predict(X_test)

In [469]:
confusion_matrix(y_test, predictions_CV)

array([[2567,    3],
       [  12, 2576]])

In [470]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions1).ravel()

In [471]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 2567
False Positives: 3
False Negatives: 12
True Positives: 2576


## TF IDF - Term Frequency Inverse Document Frequency  - Model 2

In [282]:
# Fit the transformer.
tvec = TfidfVectorizer()

In [290]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [291]:
X_train = tvec.fit_transform(X_train)

X_test = tvec.transform(X_test)

In [285]:
# Instantiate logistic regression.
lr = LogisticRegression()

# Fit logistic regression.
lr.fit(X_train, y_train)

# Evaluate logistic regression.
print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

Training Score: 0.9956049638055843
Testing Score: 0.9932144241954246


### GridsearchCV for TFIDF and LogisticRegression

In [434]:
pipe1= Pipeline([('tvec', TfidfVectorizer()),

                ('lr', LogisticRegression())])

In [435]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

In [436]:
cross_val_score(pipe1, X_train, y_train, cv=10).mean() 

0.9925026599787202

In [437]:
pipe1.fit(X_train, y_train)

Pipeline(steps=[('tvec', TfidfVectorizer()), ('lr', LogisticRegression())])

In [311]:
# Search over the following values of hyperparameters:
# Maximum number of features fit: 1000, 4000, 6000
# Minimum number of documents needed to include token: 2, 3
# Maximum number of documents needed to include token: 95%, 98%
# Check (individual tokens) and also check (individual tokens and bigrams).


pipe_params = {
            'tvec__stop_words' : [None, 'english'],
            'tvec__ngram_range' : [(1,1), (1,2), (1,3)],
            'tvec__max_df' : [.95, 0.98],
            'tvec__min_df' : [2, 3],
            'tvec__max_features' : [1000, 4000, 6000]
}

In [312]:
# Instantiate GridSearchCV.

gs1 = GridSearchCV(pipe1, # what object are we optimizing?
                  param_grid =pipe_params , # what parameters values are we searching?
                  cv=5, n_jobs=-1, verbose=1) # 5-fold cross-validation.

In [313]:
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('lr', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'tvec__max_df': [0.95, 0.98],
                         'tvec__max_features': [1000, 4000, 6000],
                         'tvec__min_df': [2, 3],
                         'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'tvec__stop_words': [None, 'english']},
             verbose=1)

In [314]:
# What's the best score?
print(gs1.best_score_)

0.9915975367405568


In [318]:
# Save best model as gs_model.
#Pipeline(steps=[('tvec',
                # TfidfVectorizer(max_df=0.95, max_features=4000, min_df=2)),
                #('lr', LogisticRegression())])
gs_model = gs1.best_estimator_
gs_model

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_df=0.95, max_features=4000, min_df=2)),
                ('lr', LogisticRegression())])

In [316]:
# Score model on training set.
gs_model.score(X_train, y_train)

0.9956049638055843

In [317]:
# Score model on testing set.
gs_model.score(X_test, y_test)

0.9932144241954246

In [438]:
predictions1 = gs_model.predict(X_test)

In [439]:
confusion_matrix(y_test, predictions1)

array([[2567,    3],
       [  12, 2576]])

In [442]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions1).ravel()

In [443]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 2567
False Positives: 3
False Negatives: 12
True Positives: 2576


## BernoulliNB , MultinomialNB, GaussianNV - Model 3

In [348]:
# reinitiating train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [349]:
#Instantiate our CountVectorizer.
cvec1 = CountVectorizer(max_features=4000, stop_words=stop_words)

In [350]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec =cvec1.fit_transform(X_train)

In [351]:
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = cvec1.transform(X_test)

In [352]:
pd.DataFrame(X_train_cvec.toarray(), columns=cvec1.get_feature_names())

Unnamed: 0,aaron,abandon,abbey,abbott,abby,abc,abercrombie,ability,able,absolute,...,zeen,zeke,zero,zoey,zone,zoomed,zootopia,zootropolis,zorro,zutomayo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### BernoulliNB

In [353]:
# Instantiate our model!

bnb = BernoulliNB()

In [354]:
# Fit our model!

bnb.fit(X_train_cvec, y_train)

BernoulliNB()

In [355]:
# Generate our predictions!

predictions = bnb.predict(X_test_cvec)

In [356]:
# Score our model on the training set.

bnb.score(X_test_cvec, y_test)

0.9656843737882901

In [357]:
# Score our model on the testing set.
bnb.score(X_train_cvec, y_train)


0.9689762150982419

In [358]:
confusion_matrix(y_test, predictions)

array([[2532,   38],
       [ 139, 2449]])

In [359]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [360]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 2532
False Positives: 38
False Negatives: 139
True Positives: 2449


### MultinomialNB

In [361]:
# Similarly Multinomial NB
mnb = MultinomialNB()

mnb.fit(X_train_cvec, y_train)

pred_multinomial = mnb.predict(X_test_cvec)

print(mnb.score(X_train_cvec, y_train))

print(mnb.score(X_test_cvec, y_test))

0.9707213029989659
0.9680108569212873


In [362]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_multinomial).ravel()
print(f"True Negatives: {tn}")
print(f"Fale Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

True Negatives: 2513
Fale Positives: 57
False Negatives: 108
True Positives: 2480


### GaussianNB

In [363]:
gnb = GaussianNB()

gnb.fit(X_train_cvec.todense(), y_train)

pred_gnb = gnb.predict(X_test_cvec.todense())

print(gnb.score(X_train_cvec.todense(), y_train))

print(gnb.score(X_test_cvec.todense(), y_test))

0.9657445708376422
0.9637456378441256


In [364]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_gnb).ravel()
print(f"True Negatives: {tn}")
print(f"Fale Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

True Negatives: 2549
Fale Positives: 21
False Negatives: 166
True Positives: 2422


## RandomForest - Model  4

In [347]:
# reinitiating train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [365]:
#Instantiate our CountVectorizer.
cvec2 = CountVectorizer(max_features=4000, stop_words=stop_words)

In [366]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec =cvec2.fit_transform(X_train)

In [367]:
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = cvec2.transform(X_test)

In [368]:
pd.DataFrame(X_train_cvec.toarray(), columns=cvec1.get_feature_names())

Unnamed: 0,aaron,abandon,abbey,abbott,abby,abc,abercrombie,ability,able,absolute,...,zeen,zeke,zero,zoey,zone,zoomed,zootopia,zootropolis,zorro,zutomayo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [369]:
# Model Instantiation
# Create an instance of RandomForestClassifier and ExtraTreesClassifier.

rf = RandomForestClassifier()
et = ExtraTreesClassifier()

In [370]:
#Model Evaluation for rf and et
cross_val_score(rf,X_train_cvec,y_train, cv=10).mean()

0.9979317308318677

In [371]:
#Model Evaluation for rf and et
cross_val_score(et,X_train_cvec,y_train, cv=10).mean()

0.9979317308318677

In [383]:
rf_params = {
    'n_estimators': [50,75,100,500],
    'max_depth': [10,15,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf':[1,3,5]
}

In [384]:
# GridSearch

gs3= GridSearchCV(estimator = RandomForestClassifier(random_state = 42),param_grid=rf_params, cv=10, verbose=1, n_jobs=-1)

gs3.fit(X_train_cvec,y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 30],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 75, 100, 500]},
             verbose=1)

In [386]:
# Best Score

gs3.best_score_

0.893743704995415

In [387]:
# Best Parameters
gs3.best_params_

{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}

In [388]:
# Assigning to best_estimators
best_rf = gs3.best_estimator_

In [389]:
# fit train data

best_rf.fit(X_train_cvec, y_train)

RandomForestClassifier(max_depth=30, n_estimators=500, random_state=42)

In [390]:
# Score
best_rf.score(X_train_cvec, y_train)

0.8956825232678387

In [391]:
# Test Score
best_rf.score(X_test_cvec, y_test)

0.8937572702597906

In [432]:
pred_rf = best_rf.predict(X_test_cvec.todense())

In [433]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_rf).ravel()
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

True Negatives: 2570
False Positives: 0
False Negatives: 548
True Positives: 2040


### Support Vector Machine - Model 5

In [396]:
# reinitiating train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [397]:
#Instantiate our CountVectorizer.
cvec2 = CountVectorizer(max_features=4000, stop_words=stop_words)

In [398]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec =cvec2.fit_transform(X_train)

In [399]:
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = cvec2.transform(X_test)

In [400]:
pd.DataFrame(X_train_cvec.toarray(), columns=cvec1.get_feature_names())

Unnamed: 0,aaron,abandon,abbey,abbott,abby,abc,abercrombie,ability,able,absolute,...,zeen,zeke,zero,zoey,zone,zoomed,zootopia,zootropolis,zorro,zutomayo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15467,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15469,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [401]:
# Instantiate support vector machine.
svc = SVC()

In [402]:
# Fit support vector machine to training data.
svc.fit(X_train_cvec, y_train)


SVC()

In [403]:
# Generate predictions.

y_pred = svc.predict(X_test_cvec)

In [404]:
# Measure performance based on accuracy.
svc.score(X_train_cvec, y_train)



0.9976085832471562

In [405]:
# Mesure of test score
svc.score(X_test_cvec, y_test)

0.9970918960837534

In [409]:
# Create DataFrame with column for predicted values.
results = pd.DataFrame(svc.predict(X_test_cvec), columns=['predicted'])

# Create column for observed values.
results['actual'] = y_test
results

Unnamed: 0,predicted,actual
0,0,
1,1,0.0
2,1,
3,1,
4,1,0.0
...,...,...
5153,0,
5154,0,
5155,1,
5156,1,


In [412]:
svm_params = {
    'kernel':['linear','rbf','polynomial']
}

In [413]:
gs5= GridSearchCV(estimator = SVC(random_state = 42),param_grid=svm_params, cv=10, verbose=1, n_jobs=-1)

gs5.fit(X_train_cvec,y_train)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


Traceback (most recent call last):
  File "/Users/ram/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ram/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/Users/ram/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 294, in _sparse_fit
    kernel_type = self._sparse_kernels.index(kernel)
ValueError: 'polynomial' is not in list

Traceback (most recent call last):
  File "/Users/ram/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ram/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/Users/ram/opt/anacon



GridSearchCV(cv=10, estimator=SVC(random_state=42), n_jobs=-1,
             param_grid={'kernel': ['linear', 'rbf', 'polynomial']}, verbose=1)

In [414]:
# Best Score

gs5.best_score_

0.9973501684513997

In [415]:
# Best Parameters
gs5.best_params_

{'kernel': 'linear'}

In [416]:
# Assigning to best_estimators
best_svm = gs5.best_estimator_

In [426]:
# fit train data

best_svm.fit(X_train_cvec, y_train)

SVC(kernel='linear', random_state=42)

In [427]:
# Score
best_svm.score(X_train_cvec, y_train)

0.9976085832471562

In [428]:
# test Score
best_svm.score(X_test_cvec, y_test)

0.9970918960837534

In [430]:
pred_svm = best_svm.predict(X_test_cvec.todense())

In [431]:
tn, fp, fn, tp = confusion_matrix(y_test, pred_svm).ravel()
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

True Negatives: 2567
False Positives: 3
False Negatives: 12
True Positives: 2576
