In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle
import re
from sklearn.metrics import confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score

# Question 1

In [2]:
df = pd.read_csv('rotten-tomatoes.csv.bz2')

### Question 1.1

In [3]:
df.sample(10)

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
4023,Kenneth Turan,fresh,115678,http://www.calendarlive.com/movies/reviews/cl-...,Los Angeles Times,As delicately and deliciously prepared as the ...,2001-02-14 00:00:00,13183,Big Night
9145,Joe Leydon,fresh,120812,http://www.variety.com/review/VE1117913350.htm...,Variety,Raucously entertaining!,2000-01-01 00:00:00,10201,Rush Hour
1666,Mick LaSalle,rotten,113305,http://www.sfgate.com/cgi-bin/article.cgi?f=/c...,San Francisco Chronicle,Higher Learning presents a profoundly uninspir...,2000-01-01 00:00:00,13586,Higher Learning
2256,Rita Kempley,rotten,107207,http://www.washingtonpost.com/wp-srv/style/lon...,Washington Post,The film takes forever to do what 60 Minutes d...,2000-01-01 00:00:00,13519,In the Name of the Father
6678,Susan Stark,fresh,119051,http://data.detnews.com/movies/details.hbs?myr...,Detroit News,Anthony Hopkins' first action movie casts him ...,2000-01-01 00:00:00,13371,The Edge
2917,Claudia Puig,fresh,426931,http://www.usatoday.com/community/utils/idmap/...,USA Today,"August Rush will not be for everyone, but it w...",2007-11-22 00:00:00,770413356,August Rush
3599,Peter Rainer,fresh,47396,http://www.newyorkmetro.com/nymetro/movies/rev...,New York Magazine,The clearest example of a Hitchcock movie that...,2000-01-01 00:00:00,10046,Rear Window
13164,Jeff Millar,rotten,134119,http://www.chron.com/cs/CDA/moviestory.mpl/ae/...,Houston Chronicle,"Perhaps at 90 or so minutes, it would have bee...",2000-01-01 00:00:00,13323,The Talented Mr. Ripley
10474,Peter Travers,rotten,133093,http://www.rollingstone.com/movies/reviews/the...,Rolling Stone,The Matrix soars with its feet in the air -- t...,2001-05-11 00:00:00,12897,The Matrix
11769,Emanuel Levy,fresh,162360,http://www.variety.com/review/VE1117488147.htm...,Variety,"Steve Zahn shines in Illsley's feature debut, ...",2007-01-29 00:00:00,10925,"Happy, Texas"


### Question 1.2

In [4]:
print('The columns are:', df.columns)

The columns are: Index(['critic', 'fresh', 'imdb', 'link', 'publication', 'quote',
       'review_date', 'rtid', 'title'],
      dtype='object')


### Question 1.3

In [5]:
df.isna().sum()

critic         705
fresh            0
imdb             0
link             0
publication      0
quote            0
review_date      0
rtid             0
title            0
dtype: int64

In [6]:
print("1.) The number of NA's in fresh are", df['fresh'].isna().sum())
print("\n2.) The number of NA's in quote are", df['quote'].isna().sum())
print('\n3.) The different values in the fresh column are:', df['fresh'].unique())
print('\n4.) The counts of the unique values in the fresh column are:\n', df['fresh'].value_counts())
count = 0
for i in range(len(df['quote'])):
    if len(df['quote'][i]) == 0 or df['quote'][i].isspace() is not False:
        count += 1
print('\n5.) The number of zero length or whitespace:', count)
print()
print('6.)', df['quote'].apply(len).describe()[['mean', 'min', 'max']])
print('\n7.) The number of reviews that are there multiple times', df[df.duplicated() == True].shape[0])

1.) The number of NA's in fresh are 0

2.) The number of NA's in quote are 0

3.) The different values in the fresh column are: ['fresh' 'rotten' 'none']

4.) The counts of the unique values in the fresh column are:
 fresh     8389
rotten    5030
none        23
Name: fresh, dtype: int64

5.) The number of zero length or whitespace: 0

6.) mean    121.23129
min       4.00000
max     256.00000
Name: quote, dtype: float64

7.) The number of reviews that are there multiple times 596


### Question 1.4

In [7]:
def clean(data):
    new_df = data[(data.fresh != 'none') & (data.duplicated() == False)]
    return new_df

In [8]:
new_df = clean(df)

In [9]:
new_df.shape

(12823, 9)

In [10]:
new_df1 = new_df.reset_index(drop = True)
new_df1.tail()

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
12818,Gene Siskel,rotten,88683,http://articles.chicagotribune.com/1985-09-13/...,Chicago Tribune,Agnes of God plays with some challenging ideas...,2013-05-08 00:00:00,11917,Agnes of God
12819,Variety Staff,rotten,88683,http://www.variety.com/review/VE1117796703.htm...,Variety,"Fonda's relentless interrogating, mannered cha...",2008-10-18 00:00:00,11917,Agnes of God
12820,,fresh,88683,http://www.timeout.com/film/reviews/77605/agne...,Time Out,Splendidly shot by Sven Nykvist and with excel...,2006-06-24 00:00:00,11917,Agnes of God
12821,Janet Maslin,rotten,88683,http://movies.nytimes.com/movie/review?res=950...,New York Times,"Miss Tilly makes a radiant Agnes, and Miss Ban...",2003-05-20 00:00:00,11917,Agnes of God
12822,Roger Ebert,rotten,88683,http://www.rogerebert.com/reviews/agnes-of-god...,Chicago Sun-Times,Although the movie deals in the basic material...,2000-01-01 00:00:00,11917,Agnes of God


# Question 2

### Question 2.2

In [11]:
vectorizer = CountVectorizer(binary=True)
# define vectorizer
X = vectorizer.fit_transform(new_df1.quote.values).toarray()
# vectorize your data. Note: this creates a sparce matrix,
# use .toarray() if you want a dense matrix.
words = vectorizer.get_feature_names()
# in case you want to see what are the actual words

### Question 2.3

In [12]:
# Splitting data into x and y
x = new_df1.loc[:, new_df1.columns != 'fresh']
y = new_df1['fresh']

# Using train test split and setting the random state to 2
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2, random_state=2)

### Question 2.4

In [13]:
# Calculating the number of fresh's and rotten's
f = y_train.value_counts()['fresh']
r = y_train.value_counts()['rotten']

In [14]:
#Calculating the log probabilities
pr_fresh = np.log(f / (f + r))
pr_rotten = np.log(r / (f + r))
print('The log probability of fresh is:', pr_fresh)
print('The log probability of rotten is:', pr_rotten)

The log probability of fresh is: -0.47960307606247643
The log probability of rotten is: -0.9650321547865404


### Question 2.5

In [15]:
X = pd.DataFrame(X)
X.columns = words

In [16]:
fresh_index = y_train[y_train == 'fresh'].index.values
rotten_index = y_train[y_train == 'rotten'].index.values

In [17]:
X_rotten = pd.DataFrame(X[X.index.isin(rotten_index)].sum())
X_rotten = X_rotten.reset_index()
X_rotten.columns = ['word', 'count']
X_rotten = X_rotten[X_rotten['count'] != 0]
X_rotten['log_prob'] = np.log(X_rotten['count']/len(rotten_index))
X_rotten_new = X_rotten[['word', 'log_prob']].set_index(['word'])
X_rotten_new = X_rotten_new.to_dict()['log_prob']

In [18]:
X_fresh = pd.DataFrame(X[X.index.isin(fresh_index)].sum())
X_fresh = X_fresh.reset_index() 
X_fresh.columns = ['word', 'count']
X_fresh = X_fresh[X_fresh['count'] != 0]
X_fresh['log_prob'] = np.log(X_fresh['count']/len(fresh_index))
X_fresh_new = X_fresh[['word', 'log_prob']].set_index(['word'])
X_fresh_new = X_fresh_new.to_dict()['log_prob']

### Question 2.6

In [19]:
pred_values = []
for sentence in x_validation.quote:
    ans_fresh, ans_rotten = 0, 0
    temp = re.split(r'\W+', sentence.lower())
    for ele in temp:
        if ele != '' and ele in X_fresh_new.keys():
            ans_fresh += X_fresh_new[ele]
        if ele != '' and ele in X_rotten_new.keys():
            ans_rotten += X_rotten_new[ele]
    if pr_fresh + ans_fresh > pr_rotten + ans_rotten:
        pred_values.append('fresh')
    else:
        pred_values.append('rotten')

### Question 2.7

In [20]:
print('Confusion matrix:\n', confusion_matrix(y_validation, pred_values))
print('Accuracy:', accuracy_score(y_validation, pred_values))

Confusion matrix:
 [[632 973]
 [350 610]]
Accuracy: 0.4842105263157895


# Question 3

### Question 3.1

Before proceeding further, we decide to filter out stopwords from our data. These are generic words such as 'and', 'the', etc. which add no value to finding whether a movie is fresh or rotten.

In [21]:
# Dropping stopwords from rotten data
for i in X_rotten.index.values:
    if X_rotten.word[i] in stopwords.words('english'):
        X_rotten.drop([i], inplace=True)

In [22]:
# Dropping stopwords from fresh data
for i in X_fresh.index.values:
    if X_fresh.word[i] in stopwords.words('english'):
        X_fresh.drop([i], inplace=True)

In [23]:
# Extracting probabilites for words that occur with a frequency of more than 30
X_rotten_freq = X_rotten[X_rotten['count'] > 30]
X_fresh_freq = X_fresh[X_fresh['count'] > 30]
print("Fresh data for frequent words: \n", X_fresh_freq)
print("Rotten data for frequent words: \n", X_rotten_freq)

Fresh data for frequent words: 
           word  count  log_prob
378      acted     39 -5.092648
379     acting     63 -4.613075
380     action    158 -3.693615
389      actor     45 -4.949548
390     actors     81 -4.361761
...        ...    ...       ...
20720  written     32 -5.290474
20762     year     79 -4.386762
20767    years     85 -4.313559
20775      yet     79 -4.386762
20793    young     63 -4.613075

[366 rows x 3 columns]
Rotten data for frequent words: 
          word  count  log_prob
379    acting     44 -4.486591
380    action    111 -3.561251
390    actors     61 -4.159907
700    almost     60 -4.176436
710      also     40 -4.581902
...       ...    ...       ...
20664   world     36 -4.687262
20676   worst     41 -4.557209
20681   would     94 -3.727486
20762    year     38 -4.633195
20775     yet     38 -4.633195

[189 rows x 3 columns]


### Question 3.2

In [24]:
# Creating a new dataframe which includes conditional log probabilities for each word
new = pd.DataFrame(words)
new['rotten_prob'] = X_rotten_freq['log_prob']
new['fresh_prob'] = X_fresh_freq['log_prob']
# Imputing NAs with a large negative value (to represent negative infinity)
# This is to represent an infinitesimal (~0) probability of the word being present in a respective Fresh/Rotten quote
new.fillna(-999999, inplace=True)
# Renaming column to 'word'
new.rename(columns={0: 'word'}, inplace=True)
# Computing difference between conditional probabilities. 
# This is to find which words are most likely to appear in Fresh and which are more likely to appear in Rotten reviews.
new['diff'] = new['fresh_prob'] - new['rotten_prob']
print('The best 10 words for Rotten are:')
new.sort_values(by='diff', ascending=True).head(10)['word']

The best 10 words for Rotten are:


6782             feels
9656       interesting
20182             want
20676            worst
1712           becomes
19543    unfortunately
9987             jokes
12377            night
10347            lacks
5821            either
Name: word, dtype: object

In [25]:
print('The best 10 words for Fresh are:')
new.sort_values(by='diff', ascending=False).head(10)['word']

The best 10 words for Fresh are:


6104      entertaining
13396      performance
768           american
6106     entertainment
20767            years
3283           classic
18292             tale
13551            piece
13197             part
6055         enjoyable
Name: word, dtype: object

### Question 3.3

In [26]:
# Comparing predicted values with target values to find misclassified quotes
test = pd.DataFrame(y_validation)
test['predicted'] = pred_values
test['quote'] = x_validation['quote']
misclassified = test[test['fresh'] != test['predicted']]
misclassified.head(30)

Unnamed: 0,fresh,predicted,quote
3953,fresh,rotten,The greatness of The Battle of Algiers lies in...
5711,fresh,rotten,"A happy, heartfelt chapter that reunites the o..."
4271,fresh,rotten,"It is a funny picture - not too consistently, ..."
9673,fresh,rotten,A pity that the directors prove less ruthless ...
9120,rotten,fresh,"Neither Trey Callaway, who wrote the script, n..."
2862,fresh,rotten,It glows with sorrowful humor in its look at t...
5748,fresh,rotten,The fast-moving 124-minute film engenders enor...
1152,fresh,rotten,"It's about seduction, and either you succumb t..."
12795,rotten,fresh,I hope what they all got paid made it worth th...
6803,fresh,rotten,Carrey turns Truman into a postmodern Capra hero.


In [27]:
misclassified.iloc[19, :]['quote']

"Although I enjoyed portions of the movie, I can't wholeheartedly recommend something with such an unfortunate and dissatisfying conclusion!"

The above quote was classified as fresh but was actually for a rotten review. The word 'enjoyed' might have been the cause for this misclassification.

In [28]:
misclassified.iloc[22, :]['quote']

'The film, like its oddly rumbling sky, promises more than it ever delivers. Granted, it can look cool. But more often, as we wait for the lightning that never arrives, it frustrates.'

The above quote was classified as rotten but was actually for a fresh review. The word 'frustrates' might have been the casue for this misclassification.

# Question 4

### Question 4.1

In [29]:
def model(df):
     # Vectorize words in df, find is word is present in respective quote
    vectorizer = CountVectorizer(binary=True)
    X = pd.DataFrame(vectorizer.fit_transform(df.quote.values).toarray())
    words = vectorizer.get_feature_names()
    # Replacing the indexes of the columns with their respective names
    X.columns = words
    
    # Split data into fresh and rotten
    x = df.loc[:, df.columns != 'fresh']
    y = df['fresh']
    
    # Split the data into training and validation
    x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2)
    
    return x_train, x_validation, y_train, y_validation

def fitNB(df):
    x_train, x_validation, y_train, y_validation = model(df)
    
    f = y_train.value_counts()['fresh']
    r = y_train.value_counts()['rotten']
    
    # Computing the log probability of fresh and rotten
    pr_fresh = np.log(f /(f + r))
    pr_rotten = np.log(r /(f + r))
    
    # Compute indices of fresh and rotten elements in training set
    fresh_index = y_train[y_train == 'fresh'].index.values
    rotten_index = y_train[y_train == 'rotten'].index.values
    
    # Subset vectorized dataframe into fresh and rotten based on indices
    X_rotten = pd.DataFrame(X[X.index.isin(rotten_index)].sum()).rename(columns={0: 'count'})
    X_rotten = X_rotten[X_rotten['count'] != 0]
    X_rotten['log_prob'] = np.log(X_rotten['count']/len(rotten_index))
    X_rotten = X_rotten.to_dict()['log_prob']
    
    X_fresh = pd.DataFrame(X[X.index.isin(fresh_index)].sum()).rename(columns={0: 'count'})
    X_fresh = X_fresh[X_fresh['count'] != 0]
    X_fresh['log_prob'] = np.log(X_fresh['count']/len(fresh_index))
    X_fresh = X_fresh.to_dict()['log_prob']
    
    return pr_fresh, pr_rotten, X_rotten, X_fresh

def predict(df):
    pr_fresh, pr_rotten, X_rotten, X_fresh = fitNB(df)
    pred_values = []
    
    '''Looping through every quote and taking every word of the quote and assigning its repective fresh or rotten
        log probability and checking which one is greater and predicting the one greater for that quote '''
    
    for sentence in x_validation.quote:
        ans_fresh, ans_rotten = 0, 0
        temp = re.split(r'\W+', sentence.lower())
        for ele in temp:
            if ele != '' and ele in X_fresh_new.keys():
                ans_fresh += X_fresh_new[ele]
            if ele != '' and ele in X_rotten_new.keys():
                ans_rotten += X_rotten_new[ele]
        if pr_fresh + ans_fresh > pr_rotten + ans_rotten:
            pred_values.append('fresh')
        else:
            pred_values.append('rotten')
    
    # Getting the confusion matrix and the accuracy score of the predicted model
    cm = confusion_matrix(y_validation, pred_values)
    ac = accuracy_score(y_validation, pred_values)
    return cm, ac

In [30]:
# Using the predict function to run through the entire process done above
cm, ac = predict(new_df1)
print('The confusion matrix is:\n', cm)
print('The accuracy is:', ac)

The confusion matrix is:
 [[632 973]
 [351 609]]
The accuracy is: 0.4838206627680312


### Question 4.2

In [31]:
def model_new(df):
     # Vectorize words in df, find is word is present in respective quote
    vectorizer = CountVectorizer(binary=True)
    X = pd.DataFrame(vectorizer.fit_transform(df.quote.values).toarray())
    words = vectorizer.get_feature_names()
    X.columns = words
    
    # Split data into fresh and rotten
    x = df.loc[:, df.columns != 'fresh']
    y = df['fresh']
    
    # Split the data into training and validation
    x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2)
    
    return x_train, x_validation, y_train, y_validation, X

def fitNB_new(df, alpha, beta):
    x_train, x_validation, y_train, y_validation, X = model_new(df)
    
    # Compute log probabilities of fresh and rotten
    f = y_train.value_counts()['fresh']
    r = y_train.value_counts()['rotten']
    
    # Adding the values of alpha and beta in their respective places to perform smoothing
    pr_fresh = np.log((f + alpha)/(f + r + alpha + beta))
    pr_rotten = np.log((r + beta)/(f + r + alpha + beta))
    
    # Compute indices of fresh and rotten elements in training set
    fresh_index = y_train[y_train == 'fresh'].index.values
    rotten_index = y_train[y_train == 'rotten'].index.values
    
    # Subset vectorized dataframe into fresh and rotten based on indices
    # Adding beta to the count of the already existing counts of rotten
    X_rotten = pd.DataFrame(X[X.index.isin(rotten_index)].sum()).rename(columns={0: 'count'})
    X_rotten['count'] = np.where(X_rotten['count'] != 0, X_rotten['count'] + beta, beta)
    X_rotten['log_prob'] = np.log((X_rotten['count'])/(len(rotten_index) + beta))
    X_rotten = X_rotten.to_dict()['log_prob']

    # Adding alpha to the count of the already existing counts of fresh
    X_fresh = pd.DataFrame(X[X.index.isin(fresh_index)].sum()).rename(columns={0: 'count'})
    X_fresh['count'] = np.where(X_fresh['count'] != 0, X_fresh['count'] + alpha, alpha)
    X_fresh['log_prob'] = np.log((X_fresh['count'] + alpha)/(len(fresh_index) + alpha))
    X_fresh = X_fresh.to_dict()['log_prob']
    
    return pr_fresh, pr_rotten, X_rotten, X_fresh, x_validation, y_validation

def predict_new(df, alpha, beta):
    pr_fresh, pr_rotten, X_rotten, X_fresh, x_validation, y_validation = fitNB_new(df, alpha, beta)
    pred_values = []
    for sentence in x_validation.quote:
        ans_fresh, ans_rotten = 0, 0
        temp = re.split(r'\W+', sentence.lower())
        for ele in temp:
            if ele != '' and ele in X_fresh.keys():
                ans_fresh += X_fresh[ele]
            if ele != '' and ele in X_rotten.keys():
                ans_rotten += X_rotten[ele]
        if pr_fresh + ans_fresh > pr_rotten + ans_rotten:
            pred_values.append('fresh')
        else:
            pred_values.append('rotten')
    cm = confusion_matrix(y_validation, pred_values)
    ac = accuracy_score(y_validation, pred_values)
    return cm, ac

In [32]:
# Prediction done after smoothing
cm, ac = predict_new(new_df1, 0.02, 0.01)
print('The confusion matrix is:\n', cm)
print('The accuracy is:', ac)

The confusion matrix is:
 [[1396  212]
 [ 462  495]]
The accuracy is: 0.7372319688109161


### Question 4.3

In [33]:
def kfoldcv(df, k, alpha, beta): 
    # Dividing the data into k-splits
    temp = np.array_split(df, k)
    ac = []
    # Looping k times (one for each fold)
    for x in range(k):
        trainList = []
        for y in range(k):
            if y == x:
                validation = temp[y]
            else:
                trainList.append(temp[y])
        train = pd.concat(trainList)

        # Splitting traning into features and target
        x_train = train.loc[:, train.columns != 'fresh']
        y_train = train.loc[:, train.columns == 'fresh']
        
        # Splitting validation into features and target
        x_validation = validation.loc[:, validation.columns != 'fresh']
        y_validation = validation.loc[:, validation.columns == 'fresh'] 
        
        # Appending the accuracy to the ac variable and then returning the mean of the list
        ac.append(fitNB_alpha(df, x_train, x_validation, y_train, y_validation, alpha, beta))

    return np.mean(ac)

def vectorizer(df):
     # Vectorize words in df, find is word is present in respective quote
    vectorizer = CountVectorizer(binary=True)
    X = pd.DataFrame(vectorizer.fit_transform(df.quote.values).toarray())
    words = vectorizer.get_feature_names()
    X.columns = words
    return X

def fitNB_alpha(df, x_train, x_validation, y_train, y_validation, alpha, beta):
    X = vectorizer(df)
    
    # Compute log probabilities of fresh and rotten
    f = y_train['fresh'].value_counts()['fresh']
    r = y_train['fresh'].value_counts()['rotten']
    pr_fresh = np.log((f + alpha)/(f + r + alpha + beta))
    pr_rotten = np.log((r + beta)/(f + r + alpha + beta))
    
    # Compute indices of fresh and rotten elements in training set
    fresh_index = y_train[y_train == 'fresh'].index.values
    rotten_index = y_train[y_train == 'rotten'].index.values
    
    # Subset vectorized dataframe into fresh and rotten based on indices
    # Adding beta to the count of the already existing counts of rotten
    X_rotten = pd.DataFrame(X[X.index.isin(rotten_index)].sum()).rename(columns={0: 'count'})
    X_rotten['count'] = np.where(X_rotten['count'] != 0, X_rotten['count'] + beta, beta)
    X_rotten['log_prob'] = np.log((X_rotten['count'])/(len(rotten_index) + beta))
    X_rotten = X_rotten.to_dict()['log_prob']

    # Adding alpha to the count of the already existing counts of fresh
    X_fresh = pd.DataFrame(X[X.index.isin(fresh_index)].sum()).rename(columns={0: 'count'})
    X_fresh['count'] = np.where(X_fresh['count'] != 0, X_fresh['count'] + alpha, alpha)
    X_fresh['log_prob'] = np.log((X_fresh['count'] + alpha)/(len(fresh_index) + alpha))
    X_fresh = X_fresh.to_dict()['log_prob']
        
    return predict_alpha(pr_fresh, pr_rotten, X_rotten, X_fresh, x_validation, y_validation, alpha, beta)

def predict_alpha(pr_fresh, pr_rotten, X_rotten, X_fresh, x_validation, y_validation, alpha, beta):
    pred_values = []
    for sentence in x_validation.quote:
        ans_fresh, ans_rotten = 0, 0
        temp = re.split(r'\W+', sentence.lower())
        for ele in temp:
            if ele != '' and ele in X_fresh.keys():
                ans_fresh += X_fresh[ele]
            if ele != '' and ele in X_rotten.keys():
                ans_rotten += X_rotten[ele]
        if pr_fresh + ans_fresh > pr_rotten + ans_rotten:
            pred_values.append('fresh')
        else:
            pred_values.append('rotten')
    cm = confusion_matrix(y_validation, pred_values)
    ac = accuracy_score(y_validation, pred_values)
    return ac

In [36]:
temp = []
# Looping through the different values of alpha and beta to get the accuracy
# Returning the alpha, beta and accuracy as a list of list
for alpha in np.arange(0.01, 0.91, 0.2):
    for beta in np.arange(0.01, 0.91, 0.2):
        temp.append([alpha, beta, kfoldcv(new_df1, 5, alpha, beta)])

print('Alpha', 'Beta', 'Accuracy')     
for i, l in enumerate(temp):
    print(l)

Alpha Beta Accuracy
[0.01, 0.01, 0.6203635888125583]
[0.01, 0.21000000000000002, 0.4994158433003987]
[0.01, 0.41000000000000003, 0.4650267156885105]
[0.01, 0.6100000000000001, 0.4470139554120177]
[0.01, 0.81, 0.4371887553864727]
[0.21000000000000002, 0.01, 0.6203635888125583]
[0.21000000000000002, 0.21000000000000002, 0.6203635888125583]
[0.21000000000000002, 0.41000000000000003, 0.6203635888125583]
[0.21000000000000002, 0.6100000000000001, 0.5172744219710309]
[0.21000000000000002, 0.81, 0.473370647106586]
[0.41000000000000003, 0.01, 0.6203635888125583]
[0.41000000000000003, 0.21000000000000002, 0.6203635888125583]
[0.41000000000000003, 0.41000000000000003, 0.6203635888125583]
[0.41000000000000003, 0.6100000000000001, 0.6203635888125583]
[0.41000000000000003, 0.81, 0.6203635888125583]
[0.6100000000000001, 0.01, 0.6203635888125583]
[0.6100000000000001, 0.21000000000000002, 0.6203635888125583]
[0.6100000000000001, 0.41000000000000003, 0.6203635888125583]
[0.6100000000000001, 0.6100000000

In [37]:
kfoldcv(new_df1, 5, 0.01, 0.1)

0.510410542737499

In [38]:
kfoldcv(new_df1, 5, 0.03, 0.09)

0.5416835293294773

In [39]:
kfoldcv(new_df1, 5, 0.02, 0)

0.6203635888125583

In [40]:
kfoldcv(new_df1, 5, 0.04, 0.09)

0.6117077361457032

In [41]:
kfoldcv(new_df1, 5, 0.1, 0)

0.6203635888125583

We found that amongst all the values of alpha and beta that we tested, we got the highest accuracy (62.02 percent), for several combinations of alpha and beta. One of these combinations is: alpha = 0.01, beta = 0.01