In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import squarify
from sklearn import metrics
from sklearn import linear_model

#classifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error

from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the Netflix Originals CSV from the Kaggle input directory into `df`.
# NOTE(review): no `encoding=` is passed, so pandas decodes the file as UTF-8;
# confirm the CSV really is UTF-8 (some titles contain accented characters).
df = pd.read_csv(r"/kaggle/input/netflix-original-films-imdb-scores/NetflixOriginals.csv")

In [None]:
# Parse the premiere date, then derive calendar features with the vectorised
# `.dt` accessors instead of a Python-level lambda applied row by row.
df['Premiere'] = pd.to_datetime(df['Premiere'])
df['Year'] = df['Premiere'].dt.year
df['Month'] = df['Premiere'].dt.month
df['Week Day'] = df['Premiere'].dt.dayofweek  # 0 = Monday ... 6 = Sunday

In [None]:
# Show the (rows, columns) shape, then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Label the premiere weekday with a short name (0 = Mon ... 6 = Sun).
# The labels are derived from `Premiere` rather than from the current contents
# of `Week Day`, so re-running this cell does not map already-mapped labels
# to NaN.
dmap = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
df['Week Day'] = df['Premiere'].dt.dayofweek.map(dmap)
df.head()

In [None]:
# Shorten the 'IMDB Score' column name to 'Score' (modifies `df` in place).
df.rename(columns={'IMDB Score':'Score'},inplace=True)
df.head()

In [None]:
# Rows whose genre is exactly 'Thriller' and whose score is above 5.
is_thriller = df['Genre'].eq('Thriller')
scored_above_5 = df['Score'].gt(5)
df[is_thriller & scored_above_5]

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Shape the frame would have if rows missing any of these columns were dropped
# (the result is only displayed; `df` itself is not modified).
required_columns = ['Title', 'Genre', 'Premiere', 'Runtime', 'Score',
                    'Language', 'Year', 'Month', 'Week Day']
df.dropna(subset=required_columns, how='any').shape

In [None]:
# Column labels currently in the frame.
df.columns

In [None]:
# Group the column names by their dtype.
df.columns.to_series().groupby(df.dtypes).groups

In [None]:
# Percentage of missing values per column, rounded to two decimals.
missing_share = df.isnull().sum().div(len(df)).mul(100)
missing_share.round(2)

In [None]:
# Percentage of missing values in each column (unrounded).
df.isnull().sum().div(len(df)).mul(100)

In [None]:
# Summary table: absolute and relative number of missing values per column.
null_mask = df.isnull()
total_miss = null_mask.sum()
perc_miss = total_miss / null_mask.count() * 100

missing_data = pd.concat(
    [total_miss, perc_miss],
    axis=1,
    keys=['Total missing', '% missing'],
)

# Three columns with the most missing values.
missing_data.sort_values(by='Total missing', ascending=False).head(3)

In [None]:
# Print each object-dtype column name followed by its distinct values.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    print(col, df[col].unique(), sep='\n')

In [None]:
# Keep only the columns with a numeric dtype and remember their names.
numerical_data = df.select_dtypes(include=np.number)
numerical_col = numerical_data.columns

print("Numeric Features:", numerical_data.head(), "====" * 20, sep="\n")

In [None]:
# Everything that is not numeric is treated as categorical here; the column
# names are kept in `categorical_col`.
categorical_data = df.select_dtypes(exclude=np.number)
categorical_col = categorical_data.columns

print("Categorical Features:", categorical_data.head(), "====" * 20, sep="\n")

In [None]:
### numerical: names of all columns whose dtype is not `object`
numerical_cols = df.select_dtypes(exclude=['object']).columns.tolist()
numerical_cols

In [None]:
### categorical: names of all columns whose dtype is `object`
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols

In [None]:
# list of numerical variables
# Select by numeric dtype (same rule as `numerical_data`) so the datetime
# column `Premiere` is not counted as numerical. The former `'y'` exclusion
# was dropped: this frame has no such column.
numerical_features = df.select_dtypes(include=np.number).columns.tolist()
print('Number of numerical variables: ', len(numerical_features))

In [None]:
#Discrete Numerical Features: numerical columns with fewer than 25 distinct values
discrete_feature = []
for feature in numerical_features:
    distinct_values = df[feature].unique()
    if len(distinct_values) < 25:
        discrete_feature.append(feature)
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
#Continuous Numerical Features: numerical columns not classified as discrete.
# The former `'deposit'` exclusion was removed: this frame has no such column.
continuous_features = [feature for feature in numerical_features if feature not in discrete_feature]
print("Continuous feature Count: {}".format(len(continuous_features)))

In [None]:
# Summary statistics for the object-dtype columns.
df.describe(include=object)

In [None]:
# Distinct values of the 'Title' column.
title_Series = df['Title']
title_levels = title_Series.unique()
title_levels

In [None]:
# Distinct values of the 'Genre' column.
genre_Series = df['Genre']
genre_levels = genre_Series.unique()
genre_levels

In [None]:
# Distinct values of the 'Language' column.
language_Series = df['Language']
language_levels = language_Series.unique()
language_levels

In [None]:
# Distinct values of the 'Week Day' column.
weekday_Series = df['Week Day']
weekday_levels = weekday_Series.unique()
weekday_levels

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Pairwise correlation of the numeric columns only. Calling `df.corr()` on the
# whole frame fails on recent pandas because `df` still holds text columns
# (Title, Genre, Language, Week Day) and the datetime column Premiere.
df.select_dtypes(include=np.number).corr()

In [None]:
# Histogram grid of the frame's columns on a 20x20 inch figure.
df.hist(figsize=(20,20))
plt.show()

In [None]:
# Correlation heatmap. The matrix is computed from the numeric columns only,
# because `df.corr()` on a frame that still contains text/datetime columns
# fails on recent pandas.
matrix = df.select_dtypes(include=np.number).corr()
f, ax = plt.subplots(figsize=(25, 12))
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(matrix, vmax=.8, square=True, cmap="RdYlGn", annot=True, ax=ax);

In [None]:
# Pairwise relationship plots for the frame.
sns.pairplot(df)

In [None]:
def bar_plot(variable):
    """Plot and print the value counts of one column of the notebook-level ``df``.

    Parameters
    ----------
    variable : column label looked up in ``df``.

    Draws a 15x3 inch bar chart with one bar per distinct value, shows it, then
    prints the counts. Returns ``None``.
    """
    counts = df[variable].value_counts()
    bar_colors = ['#00008b', '#00e5ee', '#cd1076', '#008080', '#cd5555', 'red', 'blue']

    plt.figure(figsize=(15, 3))
    plt.bar(counts.index, counts, color=bar_colors)
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)

    plt.show()
    print("{}: \n {}".format(variable, counts))

In [None]:
# Bar chart plus printed counts for the two low-cardinality text columns.
# NOTE(review): this rebinds `categorical_cols`, which an earlier cell set to
# the list of all object-dtype columns.
categorical_cols = ['Language', 'Week Day']
for c in categorical_cols:
    bar_plot(c)

In [None]:
# Horizontal bar chart of the value counts of each listed column.
categorcial_variables = ['Language', 'Week Day']
for col in categorcial_variables:
    counts = df[col].value_counts()  # computed once, reused for both axes
    plt.figure(figsize=(20, 7))
    # x/y must be passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=counts.values, y=counts.index)
    plt.title(col)
    plt.tight_layout()

In [None]:
# Column labels available for the univariate analysis below.
df.columns

# Univariate Analysis

In [None]:
# Frequency of each distinct title.
df['Title'].value_counts()

In [None]:
# Frequency of each distinct genre label.
df['Genre'].value_counts()

In [None]:
# Number of rows per premiere date.
df['Premiere'].value_counts()

In [None]:
# Frequency of each distinct runtime value.
df['Runtime'].value_counts()

In [None]:
# Frequency of each distinct score value.
df['Score'].value_counts()

In [None]:
# Horizontal count plot: one bar per distinct score.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(y=df['Score'], ax=ax)

In [None]:
# Frequency of each distinct language label.
df['Language'].value_counts()

In [None]:
# Horizontal count plot: one bar per language label.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(y=df['Language'], ax=ax)

In [None]:
# Pie chart of the language shares, each wedge labelled with its percentage.
fig, ax = plt.subplots(figsize=(10, 10))
language_counts = df['Language'].value_counts()
language_counts.plot.pie(autopct="%0.2f%%", ax=ax)

In [None]:
# Number of rows per premiere year.
df['Year'].value_counts()

In [None]:
# Horizontal count plot: one bar per premiere year.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(y=df['Year'], ax=ax)

In [None]:
# Pie chart of the share of rows per premiere year.
fig, ax = plt.subplots(figsize=(10, 10))
year_counts = df['Year'].value_counts()
year_counts.plot.pie(autopct="%0.2f%%", ax=ax)

In [None]:
# Number of rows per premiere month.
df['Month'].value_counts()

In [None]:
# Horizontal count plot: one bar per premiere month.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(y=df['Month'], ax=ax)

In [None]:
# Pie chart of the share of rows per premiere month.
fig, ax = plt.subplots(figsize=(10, 10))
month_counts = df['Month'].value_counts()
month_counts.plot.pie(autopct="%0.2f%%", ax=ax)

In [None]:
# Number of rows per premiere weekday.
df['Week Day'].value_counts()

In [None]:
# Horizontal count plot: one bar per premiere weekday.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(y=df['Week Day'], ax=ax)

In [None]:
# Pie chart of the share of rows per premiere weekday.
fig, ax = plt.subplots(figsize=(10, 10))
weekday_counts = df['Week Day'].value_counts()
weekday_counts.plot.pie(autopct="%0.2f%%", ax=ax)

In [None]:
# Treemap of the weekday counts: tile area and printed value are the count,
# the tile label is the weekday.
week = df['Week Day'].value_counts()

plt.style.use('default')
fig, ax = plt.subplots(figsize=(15, 7))
squarify.plot(sizes=week.values, label=week.index, value=week.values, ax=ax)
title_font = {'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'}
ax.set_title('Week Day Distribution', fontdict=title_font)
plt.show()

In [None]:
# Treemap of the month counts: tile area and printed value are the count,
# the tile label is the month.
month = df['Month'].value_counts()

plt.style.use('default')
fig, ax = plt.subplots(figsize=(15, 7))
squarify.plot(sizes=month.values, label=month.index, value=month.values, ax=ax)
title_font = {'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'}
ax.set_title('Month Distribution', fontdict=title_font)
plt.show()

In [None]:
# Treemap of the year counts: tile area and printed value are the count,
# the tile label is the year.
year = df['Year'].value_counts()

plt.style.use('default')
fig, ax = plt.subplots(figsize=(15, 7))
squarify.plot(sizes=year.values, label=year.index, value=year.values, ax=ax)
title_font = {'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'}
ax.set_title('Year Distribution', fontdict=title_font)
plt.show()

In [None]:
# Encode every title as an integer code, numbered in order of first appearance.
#
# This replaces a hand-typed {title: code} dictionary that had several defects:
# two missing commas fused neighbouring titles into a single key, some codes
# were assigned twice and others skipped, and keys containing mis-encoded
# characters could never match the data. Every unmatched title became NaN.
# `pd.factorize` derives the codes from the data itself, so every title present
# gets a unique code, and re-running the cell yields the same codes.
title_codes, title_uniques = pd.factorize(df['Title'])
# factorize marks missing values with -1; keep them as NaN, as `.map` did.
df['Title'] = pd.Series(title_codes, index=df.index).mask(title_codes == -1)
df.head()

In [None]:
# Encode every genre label as an integer code, numbered in order of first
# appearance.
#
# This replaces a hand-typed {genre: code} dictionary: any label not spelled
# exactly as one of its keys became NaN, and re-running the cell mapped the
# codes onto NaN. `pd.factorize` builds the codes from the data, so every label
# present gets a code and the cell can be re-run safely.
genre_codes, genre_uniques = pd.factorize(df['Genre'])
# factorize marks missing values with -1; keep them as NaN, as `.map` did.
df['Genre'] = pd.Series(genre_codes, index=df.index).mask(genre_codes == -1)
df.head()

In [None]:
# Encode every language label as an integer code, numbered in order of first
# appearance.
#
# This replaces a hand-typed {language: code} dictionary: a label missing from
# the dictionary became NaN, and re-running the cell mapped the codes onto NaN.
# `pd.factorize` builds the codes from the data itself.
language_codes, language_uniques = pd.factorize(df['Language'])
# factorize marks missing values with -1; keep them as NaN, as `.map` did.
df['Language'] = pd.Series(language_codes, index=df.index).mask(language_codes == -1)
df.head()

In [None]:
# Replace the weekday labels with integer codes; labels missing from the
# table become NaN.
weekday_codes = {
    'Mon': 0,
    'Fri': 1,
    'Thu': 2,
    'Tue': 3,
    'Wed': 4,
    'Sun': 5,
    'Sat': 6,
}
df['Week Day'] = df['Week Day'].map(weekday_codes)
df.head()

# Dist Plot

In [None]:
# Density histogram with a KDE overlay for the premiere month.
# `sns.distplot` is deprecated; this is the documented `histplot` equivalent.
plt.figure(figsize=(10, 10))
sns.histplot(x=df['Month'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Density histogram with a KDE overlay for the premiere year.
# `sns.distplot` is deprecated; this is the documented `histplot` equivalent.
plt.figure(figsize=(10, 10))
sns.histplot(x=df['Year'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Density histogram with a KDE overlay for the score.
# `sns.distplot` is deprecated; this is the documented `histplot` equivalent.
plt.figure(figsize=(10, 10))
sns.histplot(x=df['Score'], kde=True, stat='density', kde_kws={'cut': 3})

# Bar Plot

In [None]:
# Bar plot of Month against Year, split by the weekday code.
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=df, x='Year', y='Month', hue='Week Day', ax=ax)

In [None]:
# Bar plot of the language code against Score.
fig, ax = plt.subplots(figsize=(20, 7))
sns.barplot(data=df, x='Score', y='Language', ax=ax)

In [None]:
# Bar plot of the language code against Year.
fig, ax = plt.subplots(figsize=(20, 7))
sns.barplot(data=df, x='Year', y='Language', ax=ax)

In [None]:
# Bar plot of the weekday code against Year, split by Month.
fig, ax = plt.subplots(figsize=(20, 7))
sns.barplot(data=df, x='Year', y='Week Day', hue='Month', ax=ax)

# Box Plot

In [None]:
# Box plot of the language code against Score.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x='Score', y='Language', ax=ax)

In [None]:
# Box plot of Month against Year, split by the weekday code.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x='Year', y='Month', hue='Week Day', ax=ax)

In [None]:
# Box plot of Month against Year.
fig, ax = plt.subplots(figsize=(20, 7))
sns.boxplot(data=df, x='Year', y='Month', ax=ax)

# Count Plot

In [None]:
# Row counts per year, split by the weekday code.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x=df['Year'], hue=df['Week Day'], ax=ax)

In [None]:
# Row counts per year, split by month.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x=df['Year'], hue=df['Month'], ax=ax)

In [None]:
# Row counts per month, split by the weekday code.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x=df['Month'], hue=df['Week Day'], ax=ax)

In [None]:
# Drop every row that has a missing value in any column and rebind `df`.
# NOTE(review): this also removes rows whose Title/Genre/Language was turned
# into NaN by the `.map(...)` encodings above, not only rows missing in the CSV.
df = df.dropna()
df.head()

In [None]:
# Drop rows with missing values in place, then report the remaining
# per-column NaN counts.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

In [None]:
# Display the column labels of the frame at this point in the notebook.
df.columns

In [None]:
# Classifiers: pick the candidate feature columns and preview them.
X = df[
    [
        'Title',
        'Genre',
        'Runtime',
        'Language',
        'Year',
        'Month',
        'Week Day',
    ]
]
X.head()

In [None]:
# Single-column frame holding the 'Score' column, previewed below.
Y = df[['Score']]
Y.head()

In [None]:
# Rebind X and Y to the first and second positional columns of df, each
# reshaped into an (n_rows, 1) column vector.
# NOTE(review): this discards whatever X and Y held before this cell. Going by
# the column order listed earlier in the notebook, positions 0 and 1 look like
# 'Title' and 'Genre' rather than a feature set and 'Score' -- confirm this is
# intended before trusting the model results below.
X = df.iloc[:, 0].to_numpy().reshape(-1, 1)
Y = df.iloc[:, 1].to_numpy().reshape(-1, 1)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
st_x = StandardScaler().fit(X_train)
X_train = st_x.transform(X_train)
X_test = st_x.transform(X_test)

# Linear Regression

In [None]:
# Ordinary least squares on the training split (fit returns the estimator).
regressor = LinearRegression().fit(X_train, Y_train)

# Fitted intercept first, then the fitted slope coefficient(s).
print(regressor.intercept_)
print(regressor.coef_)

In [None]:
# Score the regressor on the held-out split; the MSE is computed once and
# reused for the RMSE line.
Y_pred = regressor.predict(X_test)
test_mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', test_mse)
print('Root Mean Squared Error:', np.sqrt(test_mse))

In [None]:
# Fit a second linear model on the full (unsplit) X / Y and score it on the
# same data it was trained on.
regression_model = LinearRegression()
regression_model.fit(X, Y)

# In-sample predictions.
Y_pred = regression_model.predict(X)

# Model evaluation. mean_squared_error returns the *squared* error, so take
# the square root to get the RMSE that the label below promises.
rmse = np.sqrt(mean_squared_error(Y, Y_pred))
r2 = r2_score(Y, Y_pred)

# Report the fitted parameters and the in-sample fit quality.
print('Slope:' ,regression_model.coef_)
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
import statsmodels.api as sm

# Synthetic demo data: y = x plus a little uniform noise.
# NOTE: this rebinds the notebook-level names X and Y to random data.
# A seeded local generator keeps the printed summary and the plot reproducible.
rng = np.random.default_rng(0)
X = rng.random(100)
Y = X + rng.random(100) * 0.1

# add_constant prepends a column of ones, so the fitted parameters come back
# ordered as [intercept, slope].
results = sm.OLS(Y, sm.add_constant(X)).fit()

print(results.summary())

plt.scatter(X, Y)

# Draw the fitted line: intercept is params[0], slope is params[1].
X_plot = np.linspace(0, 1, 100)
plt.plot(X_plot, results.params[0] + results.params[1] * X_plot)

plt.show()

In [None]:
from sklearn.svm import SVC # "Support vector classifier"

# Linear-kernel support vector classifier trained on the training split.
classifier = SVC(kernel='linear', random_state=0)
# The estimator expects a 1-D target; flatten the (n, 1) column vector rather
# than relying on an implicit conversion (whose warning is silenced here).
classifier.fit(X_train, np.ravel(Y_train))

In [None]:
# SVC predictions for the held-out rows (rebinds Y_pred).
Y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix for the current Y_pred; stored only, nothing is displayed.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Per-class precision / recall / F1 for the current Y_pred.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Create and fit a DecisionTreeClassifier on the training split.
# The fixed random_state makes the fitted tree reproducible across runs,
# consistent with the seeded train/test split and the seeded SVC.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train,Y_train)

In [None]:
# Decision-tree predictions for the held-out rows (rebinds Y_pred), echoed
# as the cell output.
Y_pred = dtc.predict(X=X_test)
Y_pred

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
dtc_acc = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('The accuracy score with using the decision tree classifier is :', dtc_acc)

In [None]:
# Per-class precision / recall / F1 for the current Y_pred.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Compute all three evaluation artefacts for the current Y_pred up front...
result = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
result1 = classification_report(y_true=Y_test, y_pred=Y_pred)
result2 = accuracy_score(y_true=Y_test, y_pred=Y_pred)

# ...then print them in order: matrix, report, accuracy.
print("Confusion Matrix:")
print(result)
print("Classification Report:")
print(result1)
print("Accuracy:", result2)