# Reading and Combining DataFrames

In [1]:
import pandas as pd
import numpy as np
import gzip
# Plot Libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Downloaded movie datasets from https://www.imdb.com/interfaces/

## TitleRatings: use as TRatings

In [3]:
TRatings=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.ratings.tsv.gz',compression='gzip',sep='\t')
TRatings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1696
1,tt0000002,6.0,210
2,tt0000003,6.5,1443
3,tt0000004,6.1,122
4,tt0000005,6.1,2243


In [4]:
TRatings.to_csv('D:/My files/Projects/Project Classifier MRS/TRatings.csv')

## TitleBasics: use as TBasics

In [5]:
TBasics=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.basics.tsv.gz',compression='gzip',sep='\t')
TBasics.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
TBasics.to_csv('D:/My files/Projects/Project Classifier MRS/TBasics.csv')

## TitleAkas: use as TAkas

In [7]:
TAkas=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.akas.tsv.gz',compression='gzip',sep='\t')
TAkas.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [8]:
TAkas=TAkas[['titleId','region','language']]
TAkas['region']=TAkas['region'].replace("\\N",np.nan)
TAkas['language']=TAkas['language'].replace("\\N",np.nan)
TAkas['region'] =TAkas['region'] .fillna('')
TAkas['language'] =TAkas['language'] .fillna('')
print("No of Duplicate Values:", TAkas.duplicated().sum())
TAkas.drop_duplicates(inplace=True)

No of Duplicate Values: 291414


In [9]:
TAkas = TAkas.groupby(['titleId']).agg(" ".join).reset_index()
TAkas.drop_duplicates(inplace=True)
TAkas.head()

Unnamed: 0,titleId,region,language
0,tt0000001,UA DE HU GR RU US JP,ja
1,tt0000002,HU FR DE RO RU US JP,ja
2,tt0000003,RO HU JP UA RU GB FR DE,ja
3,tt0000004,DE RO FR RU JP HU,ja
4,tt0000005,GB US UA RU DE HU,


In [10]:
TAkas=TAkas.rename(columns={"titleId":"tconst"})
TAkas.head()

Unnamed: 0,tconst,region,language
0,tt0000001,UA DE HU GR RU US JP,ja
1,tt0000002,HU FR DE RO RU US JP,ja
2,tt0000003,RO HU JP UA RU GB FR DE,ja
3,tt0000004,DE RO FR RU JP HU,ja
4,tt0000005,GB US UA RU DE HU,


In [11]:
TAkas.to_csv('D:/My files/Projects/Project Classifier MRS/TAkas.csv')

## TitleEpisode: use as TvTitles

In [12]:
TEpisode=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.episode.tsv.gz',compression='gzip',sep='\t')
TEpisode.head()

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16


#### Extracting Series Title for every episode of Series using TEpisode and TBasics

In [13]:
TEpisode=TEpisode.rename(columns={"tconst": "a", "parentTconst": "tconst"})
TEpisode.head()

Unnamed: 0,a,tconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16


In [14]:
TvTitles=pd.merge(TEpisode,TBasics,on='tconst',how='left')
TvTitles=TvTitles.rename(columns={"tconst":"parentTconst","a": "tconst"})
TvTitles=TvTitles[['tconst','seasonNumber','episodeNumber','primaryTitle']]
TvTitles=TvTitles.rename(columns={"primaryTitle":"SeriesTitle"})
TvTitles.head()

Unnamed: 0,tconst,seasonNumber,episodeNumber,SeriesTitle
0,tt0041951,1,9,The Lone Ranger
1,tt0042816,1,17,BBC Sunday-Night Theatre
2,tt0042889,\N,\N,BBC Sunday-Night Theatre
3,tt0043426,3,42,Studio One in Hollywood
4,tt0043631,2,16,BBC Sunday-Night Theatre


In [15]:
TvTitles=TvTitles.applymap(str)
TvTitles['SeriesInfo']=TvTitles['tconst']
TvTitles['SeriesInfo']=TvTitles['SeriesTitle']+" S"+TvTitles['seasonNumber']+" E"+TvTitles['episodeNumber']
TvTitles.loc[TvTitles['SeriesInfo'].str.contains(r'\\N'), 'SeriesInfo'] = TvTitles.SeriesTitle

In [16]:
TvTitles=TvTitles[['tconst','SeriesInfo']]
TvTitles.head()

Unnamed: 0,tconst,SeriesInfo
0,tt0041951,The Lone Ranger S1 E9
1,tt0042816,BBC Sunday-Night Theatre S1 E17
2,tt0042889,BBC Sunday-Night Theatre
3,tt0043426,Studio One in Hollywood S3 E42
4,tt0043631,BBC Sunday-Night Theatre S2 E16


In [17]:
TvTitles.to_csv('D:/My files/Projects/Project Classifier MRS/TvTitles.csv')

## NBasics: usage for naming nconsts

In [2]:
NBasics=pd.read_csv('D:/My files/Projects/Project Classifier MRS/name.basics.tsv.gz',compression='gzip',sep='\t')
NBasics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0072308,tt0053137,tt0031983"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0071877,tt0117057,tt0037382,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0054452,tt0049189,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0077975,tt0072562,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0050976,tt0050986,tt0060827"


In [3]:
NBasics = NBasics[['nconst','primaryName']]

## TitlePrincipals: use as TPrincipalNames

In [4]:
TPrincipals=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.principals.tsv.gz',compression='gzip',sep='\t')
TPrincipals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [5]:
TPrincipals = TPrincipals[['tconst','nconst']]

#### Giving nconsts Names

In [6]:
TPrincipalNames=pd.merge(TPrincipals,NBasics,on='nconst',how='left')
TPrincipalNames.head()

Unnamed: 0,tconst,nconst,primaryName
0,tt0000001,nm1588970,Carmencita
1,tt0000001,nm0005690,William K.L. Dickson
2,tt0000001,nm0374658,William Heise
3,tt0000002,nm0721526,Émile Reynaud
4,tt0000002,nm1335271,Gaston Paulin


In [7]:
TPrincipalNames=TPrincipalNames[['tconst','primaryName']]
TPrincipalNames['primaryName']=TPrincipalNames['primaryName'].replace("\\N",np.nan)
TPrincipalNames['primaryName']=TPrincipalNames['primaryName'] .fillna('')
print("No of Duplicate Values:", TPrincipalNames.duplicated().sum())
TPrincipalNames.drop_duplicates(inplace=True)
TPrincipalNames=TPrincipalNames.groupby(['tconst']).agg(" ".join).reset_index()
TPrincipalNames.drop_duplicates(inplace=True)
TPrincipalNames=TPrincipalNames.rename(columns={"primaryName":"CastName"})
TPrincipalNames.head()

No of Duplicate Values: 41321


Unnamed: 0,tconst,CastName
0,tt0000001,Carmencita William K.L. Dickson William Heise
1,tt0000002,Émile Reynaud Gaston Paulin
2,tt0000003,Émile Reynaud Julien Pappé Gaston Paulin Tamar...
3,tt0000004,Émile Reynaud Gaston Paulin
4,tt0000005,Charles Kayser John Ott William K.L. Dickson T...


In [8]:
TPrincipalNames.to_csv('D:/My files/Projects/Project Classifier MRS/TPrincipalNames.csv')

## TitleCrew: use as TCrewNames

In [9]:
TCrew=pd.read_csv('D:/My files/Projects/Project Classifier MRS/title.crew.tsv.gz',compression='gzip',sep='\t')
TCrew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [10]:
TCrew = TCrew[['tconst','directors']]
TCrew['directors']=TCrew['directors'].replace("\\N",np.nan)
TCrew = TCrew[TCrew['directors'].notna()]

In [11]:
def splitDataFrameList(df,target_column,separator):
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

TCrew1=splitDataFrameList(TCrew,'directors',',')

In [12]:
TCrew1=TCrew1.rename(columns={"directors":"nconst"})
TCrewNames=pd.merge(TCrew1,NBasics,on='nconst',how='left')
TCrewNames=TCrewNames.rename(columns={"primaryName":"DirectorName"})
TCrewNames=TCrewNames[['tconst','DirectorName']]

In [13]:
TCrewNames['DirectorName']=TCrewNames['DirectorName'].replace("\\N",np.nan)
TCrewNames['DirectorName']=TCrewNames['DirectorName'] .fillna('')
print("No of Duplicate Values:", TCrewNames.duplicated().sum())
TCrewNames.drop_duplicates(inplace=True)
TCrewNames=TCrewNames.groupby(['tconst']).agg(" ".join).reset_index()
TCrewNames.drop_duplicates(inplace=True)
TCrewNames

No of Duplicate Values: 3361


Unnamed: 0,tconst,DirectorName
0,tt0000001,William K.L. Dickson
1,tt0000002,Émile Reynaud
2,tt0000003,Émile Reynaud
3,tt0000004,Émile Reynaud
4,tt0000005,William K.L. Dickson
...,...,...
4469586,tt9916848,Semih Bagci Deniz Yorulmazer
4469587,tt9916850,Semih Bagci Deniz Yorulmazer
4469588,tt9916852,Semih Bagci Deniz Yorulmazer
4469589,tt9916856,Johan Planefeldt


In [14]:
TCrewNames.to_csv('D:/My files/Projects/Project Classifier MRS/TCrewNames.csv')

## Combining tables

In [2]:
TRatings=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TRatings.csv')
TBasics=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TBasics.csv')
TCrewNames=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TCrewNames.csv')
TPrincipalNames=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TPrincipalNames.csv')
TAkas=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TAkas.csv')
TvTitles=pd.read_csv('D:/My files/Projects/Project Classifier MRS/TvTitles.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
combinedTable1=pd.merge(TRatings,TBasics,on='tconst',how='outer')
combinedTable2=pd.merge(TCrewNames,TPrincipalNames,on='tconst',how='outer')
combinedTable3=pd.merge(TAkas,TvTitles,on='tconst',how='outer')
combinedTable4=pd.merge(combinedTable1,combinedTable2,on='tconst',how='outer')

In [4]:
combinedTable=pd.merge(combinedTable3,combinedTable4,on='tconst',how='outer')

In [5]:
combinedTable.to_csv('D:/My files/Projects/Project Classifier MRS/combinedTable.csv')

### Read combined data

In [3]:
rawData=pd.read_csv('D:/My files/Projects/Project Classifier MRS/combinedTable.csv')
rawData.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7785629 entries, 0 to 7785628
Data columns (total 23 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Unnamed: 0      int64  
 1   Unnamed: 0_x    float64
 2   tconst          object 
 3   region          object 
 4   language        object 
 5   Unnamed: 0_y    float64
 6   SeriesInfo      object 
 7   Unnamed: 0_x_x  float64
 8   averageRating   float64
 9   numVotes        float64
 10  Unnamed: 0_y_x  float64
 11  titleType       object 
 12  primaryTitle    object 
 13  originalTitle   object 
 14  isAdult         object 
 15  startYear       object 
 16  endYear         object 
 17  runtimeMinutes  object 
 18  genres          object 
 19  Unnamed: 0_x_y  float64
 20  DirectorName    object 
 21  Unnamed: 0_y_y  float64
 22  CastName        object 
dtypes: float64(8), int64(1), object(14)
memory usage: 1.3+ GB


In [4]:
data=rawData.copy()

In [5]:
data=data.drop(['Unnamed: 0','Unnamed: 0_x','Unnamed: 0_y','Unnamed: 0_x_x','Unnamed: 0_y_x','Unnamed: 0_x_y',
                'Unnamed: 0_y_y','originalTitle'],axis=1)
data

Unnamed: 0,tconst,region,language,SeriesInfo,averageRating,numVotes,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres,DirectorName,CastName
0,tt0000001,UA DE HU GR RU US JP,ja,,5.6,1696.0,short,Carmencita,0,1894,\N,1,"Documentary,Short",William K.L. Dickson,Carmencita William K.L. Dickson William Heise
1,tt0000002,HU FR DE RO RU US JP,ja,,6.0,210.0,short,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
2,tt0000003,RO HU JP UA RU GB FR DE,ja,,6.5,1443.0,short,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",Émile Reynaud,Émile Reynaud Julien Pappé Gaston Paulin Tamar...
3,tt0000004,DE RO FR RU JP HU,ja,,6.1,122.0,short,Un bon bock,0,1892,\N,12,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
4,tt0000005,GB US UA RU DE HU,,,6.1,2243.0,short,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",William K.L. Dickson,Charles Kayser John Ott William K.L. Dickson T...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7785624,tt9230980,,,,,,,,,,,,,,Daniel Küblböck
7785625,tt9230982,,,,,,,,,,,,,,Daniel Küblböck
7785626,tt9230984,,,,,,,,,,,,,,Daniel Küblböck
7785627,tt9230986,,,,,,,,,,,,,,Daniel Küblböck


In [6]:
data.isnull().sum()

tconst                  0
region            2297812
language          4255998
SeriesInfo        2124658
averageRating     6648811
numVotes          6648811
titleType            5260
primaryTitle         5271
isAdult              5260
startYear            5260
endYear              5260
runtimeMinutes       5260
genres               5270
DirectorName      3316067
CastName           720465
dtype: int64

In [7]:
data = data[data['primaryTitle'].notna()]

In [8]:
data['startYear'] = pd.to_numeric(data['startYear'], errors='coerce')
data['startYear'] = data['startYear'].replace(np.nan, 0, regex=True)

data['runtimeMinutes']=pd.to_numeric(data['runtimeMinutes'], errors='coerce')
data['runtimeMinutes']=data['runtimeMinutes'].replace(np.nan, 0, regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [9]:
data['endYear'] = data.apply(lambda x: str(x['endYear']).replace('\\N',str(x['startYear'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
data['endYear']=pd.to_numeric(data['endYear'], errors='coerce')
data['endYear']=data['endYear'].replace(np.nan, 0, regex=True)

data['isAdult']=pd.to_numeric(data['isAdult'], errors='coerce')
data['isAdult']=data['isAdult'].replace(np.nan, 0, regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [11]:
data[['averageRating','numVotes']]=data[['averageRating','numVotes']].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
data=data.replace(r'\\N', '', regex=True)
data=data.fillna('')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7780358 entries, 0 to 7785580
Data columns (total 15 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   region          object 
 2   language        object 
 3   SeriesInfo      object 
 4   averageRating   float64
 5   numVotes        float64
 6   titleType       object 
 7   primaryTitle    object 
 8   isAdult         float64
 9   startYear       float64
 10  endYear         float64
 11  runtimeMinutes  float64
 12  genres          object 
 13  DirectorName    object 
 14  CastName        object 
dtypes: float64(6), object(9)
memory usage: 949.8+ MB


In [13]:
data.isnull().sum()

tconst            0
region            0
language          0
SeriesInfo        0
averageRating     0
numVotes          0
titleType         0
primaryTitle      0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
DirectorName      0
CastName          0
dtype: int64

In [14]:
x = data.duplicated(subset=['region','language','SeriesInfo','averageRating','numVotes','titleType','primaryTitle',
                            'isAdult','startYear','endYear','runtimeMinutes','genres','DirectorName','CastName']).sum()
print("No of Duplicate Values", x)
data.drop_duplicates(subset = ['region','language','SeriesInfo','averageRating','numVotes','titleType','primaryTitle',
                               'isAdult','startYear','endYear','runtimeMinutes','genres','DirectorName','CastName'], keep = False)

No of Duplicate Values 1342


Unnamed: 0,tconst,region,language,SeriesInfo,averageRating,numVotes,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres,DirectorName,CastName
0,tt0000001,UA DE HU GR RU US JP,ja,,5.6,1696.0,short,Carmencita,0.0,1894.0,1894.0,1.0,"Documentary,Short",William K.L. Dickson,Carmencita William K.L. Dickson William Heise
1,tt0000002,HU FR DE RO RU US JP,ja,,6.0,210.0,short,Le clown et ses chiens,0.0,1892.0,1892.0,5.0,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
2,tt0000003,RO HU JP UA RU GB FR DE,ja,,6.5,1443.0,short,Pauvre Pierrot,0.0,1892.0,1892.0,4.0,"Animation,Comedy,Romance",Émile Reynaud,Émile Reynaud Julien Pappé Gaston Paulin Tamar...
3,tt0000004,DE RO FR RU JP HU,ja,,6.1,122.0,short,Un bon bock,0.0,1892.0,1892.0,12.0,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
4,tt0000005,GB US UA RU DE HU,,,6.1,2243.0,short,Blacksmith Scene,0.0,1893.0,1893.0,1.0,"Comedy,Short",William K.L. Dickson,Charles Kayser John Ott William K.L. Dickson T...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7785576,tt9914646,,,,0.0,0.0,short,Plan C,0.0,2019.0,2019.0,0.0,"Comedy,Short",Alexia Dalla Rosa Madison Graves Justin Settem...,Andrea Pavlovic Tej Sangani Alexia Dalla Rosa ...
7785577,tt9915800,,,,0.0,0.0,short,Hilal (The Crescent),0.0,2018.0,2018.0,8.0,Short,,
7785578,tt9915808,,,,0.0,0.0,movie,Salt Lake,0.0,0.0,0.0,0.0,"Drama,Thriller",Luke Creely,Nicole Pastor Nick Bracks Emma Louise Bournes ...
7785579,tt9916178,,,,0.0,0.0,movie,Yesterday's Dreams,0.0,0.0,0.0,0.0,,,


In [15]:
data.to_csv('D:/My files/Projects/Project Classifier MRS/rawData.csv')

### Read pre-processed data

In [16]:
data=pd.read_csv('D:/My files/Projects/Project Classifier MRS/rawData.csv')
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,tconst,region,language,SeriesInfo,averageRating,numVotes,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres,DirectorName,CastName
0,0,tt0000001,UA DE HU GR RU US JP,ja,,5.6,1696.0,short,Carmencita,0.0,1894.0,1894.0,1.0,"Documentary,Short",William K.L. Dickson,Carmencita William K.L. Dickson William Heise
1,1,tt0000002,HU FR DE RO RU US JP,ja,,6.0,210.0,short,Le clown et ses chiens,0.0,1892.0,1892.0,5.0,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
2,2,tt0000003,RO HU JP UA RU GB FR DE,ja,,6.5,1443.0,short,Pauvre Pierrot,0.0,1892.0,1892.0,4.0,"Animation,Comedy,Romance",Émile Reynaud,Émile Reynaud Julien Pappé Gaston Paulin Tamar...
3,3,tt0000004,DE RO FR RU JP HU,ja,,6.1,122.0,short,Un bon bock,0.0,1892.0,1892.0,12.0,"Animation,Short",Émile Reynaud,Émile Reynaud Gaston Paulin
4,4,tt0000005,GB US UA RU DE HU,,,6.1,2243.0,short,Blacksmith Scene,0.0,1893.0,1893.0,1.0,"Comedy,Short",William K.L. Dickson,Charles Kayser John Ott William K.L. Dickson T...


In [None]:
data=data.drop(['Unnamed: 0'],axis=1)

In [None]:
data.info()

In [None]:
data['genres'].value_counts().head(15)

In [None]:
df=data.copy()

In [None]:
df=df.applymap(str)
df.info()

### Filtering data according to our requirements

In [None]:
df['description'] = a['averageRating']+a['numVotes']+a['directors']+a['writers']+a['titleType']+a['originalTitle']+a['isAdult']+a['startYear']+a['endYear']+a['runtimeMinutes']+a['genres']+a['nconst']+a['region']+a['language']

In [None]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(a['description'])

In [None]:
tfidf_matrix.shape

In [None]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
a = a.reset_index()
titles = a['originalTitle']
indices = pd.Series(a.index, index=a['originalTitle'])

In [None]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [None]:
#movie_recommend('King Kong').head(10)
get_recommendations('King Kong').head(10)

In [None]:
a.loc[[16032,10612,8134,12441,7082,8834,5009,7847,16384,11691]]

In [None]:
finalTable.loc[531673]