In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

In [2]:
!ls -lt *.csv

-rw-r--r-- 1 deon deon 2483723 Feb  8 17:59 ratings.csv
-rw-r--r-- 1 deon deon  494431 Feb  8 17:59 movies.csv
-rw-r--r-- 1 deon deon 3811669 Jan 15 11:47 income_evaluation.csv
-rw-r--r-- 1 deon deon 4174304 Jan  9 11:58 credit_card_default.csv
-rw-r--r-- 1 deon deon   96126 Aug 28 09:41 yield.csv
-rw-r--r-- 1 deon deon 2475934 Jul 15  2019 kc_house_data.csv
-rw-r--r-- 1 deon deon  146724 Jun 24  2019 NHIS 2007 data.csv
-rw-r--r-- 1 deon deon  672305 Jun 24  2019 super_hero_powers.csv
-rw-r--r-- 1 deon deon   49195 Jun 24  2019 heroes_information.csv
-rw-r--r-- 1 deon deon  165456 Jun 24  2019 WorldCupMatches.csv
-rw-r--r-- 1 deon deon     544 Jun 21  2019 purchases.csv


In [6]:
df_ratings = pd.read_csv('./ratings.csv')
df_ratings.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [7]:
df_movies = pd.read_csv('./movies.csv')
df_movies.head(20)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [24]:
df_movies['genres'].nunique()

951

In [8]:
from sklearn.linear_model import LogisticRegression

This [movie, ranking] dataset is open-sourced and available for download at the [UCI Machine Learning Repository]: [URL]

It was originally created by: [NAME(S)]  [Title]  [Date]

In [11]:
df_merged = pd.merge(df_movies, df_ratings, on='movieId')
df_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [20]:
movieId_1 = df_merged[df_merged['movieId'] == 1]   
np.mean(movieId_1['rating'])

3.9209302325581397

In [21]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
movieId      100836 non-null int64
title        100836 non-null object
genres       100836 non-null object
userId       100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


> Singular Value Decomposition (SVD)
- A linear Algebra method that can decompose a utility matrix into three compressed matrices
- Model-based recommendr - use these compressed matrices to make a recommendation without having to refer back to the complete data set
- Latent variables - inferred, non-observable
- A = u * S * v

## Model-based Collaborative Filtering System

### SVD Matrix Factorization

In [22]:
from sklearn.decomposition import TruncatedSVD   # Note: Utility can not contain np.Nan must fill_value = 0 (see below)

In [29]:
columns =['movieId','movieTitle','releaseDate','videoRelDate','IMDb_URL','unknown','Action','Adventure','Animation',
          'Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance',
         'Sci-Fi','Thriller','War','Western']
df_item = pd.read_csv('./u.item', sep='|', encoding='latin-1', names = columns)
df_item.head()

Unnamed: 0,movieId,movieTitle,releaseDate,videoRelDate,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
# Combine dataframes
combined_movies_data = pd.merge(df_ratings, df_item, on='movieId')
combined_movies_data.tail()

Unnamed: 0,userId,movieId,rating,timestamp,movieTitle,releaseDate,videoRelDate,IMDb_URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
34592,606,1156,3.5,1171361663,Cyclo (1995),02-Aug-1996,,http://us.imdb.com/M/title-exact?Cyclo%20(1995),0,0,...,0,0,0,0,0,0,0,0,0,0
34593,606,1493,3.5,1173447143,"Modern Affair, A (1995)",06-Sep-1996,,http://us.imdb.com/M/title-exact?Modern%20Affa...,0,0,...,0,0,0,0,0,1,0,0,0,0
34594,607,241,4.0,964744490,"Last of the Mohicans, The (1992)",01-Jan-1992,,http://us.imdb.com/M/title-exact?Last%20of%20t...,0,1,...,0,0,0,0,0,1,0,0,1,0
34595,609,137,3.0,847221054,Big Night (1996),20-Sep-1996,,http://us.imdb.com/M/title-exact?Big%20Night%2...,0,0,...,0,0,0,0,0,0,0,0,0,0
34596,610,1054,2.0,1493850563,Mr. Wrong (1996),16-Feb-1996,,http://us.imdb.com/M/title-exact?Mr.%20Wrong%2...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Most number of reviews?
combined_movies_data.groupby('movieId')['rating'].count().sort_values(ascending =False).head()

movieId
356    329
318    317
296    307
593    279
260    251
Name: rating, dtype: int64

In [38]:
# Avg Rating 5.0?
combined_movies_data.groupby('movieId')['rating'].mean().sort_values(ascending =False).head()

movieId
633     5.0
467     5.0
1151    5.0
626     5.0
148     5.0
Name: rating, dtype: float64

In [39]:
# Avg Rating 1.0?
combined_movies_data.groupby('movieId')['rating'].mean().sort_values(ascending =True).head()

movieId
1328    0.75
1574    1.00
476     1.00
698     1.00
1335    1.00
Name: rating, dtype: float64

In [52]:
# Find Name of movies to associate with above data
combined_movies_data[combined_movies_data['movieId'] == 356]['movieTitle'].unique()

array(['Client, The (1994)'], dtype=object)

### Building a Utility Matrix
> Sparse Matrix

In [60]:
ratings_crosstab = combined_movies_data.pivot_table(values = 'rating', index= 'userId', columns= 'movieTitle', fill_value=0) #idex are rows
ratings_crosstab.tail()

movieTitle,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Wonderland (1997),"Wooden Man's Bride, The (Wu Kui) (1994)","Wrong Trousers, The (1993)",You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",unknown,Á köldum klaka (Cold Fever) (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,3.5,0.0,3.0,0,0,0.0,0.0,0.0,0.0,0,...,3.0,0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0
607,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0
608,0.0,0.0,0.0,0,0,0.0,3.0,0.0,0.0,0,...,0.0,0,0.0,0.5,3.0,0.0,0.0,0.0,0.0,0
609,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0
610,4.5,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### Transposing the Marix

In [61]:
ratings_crosstab.shape

(608, 1255)

In [62]:
# Using sklearn.TruncatedSVD we are going to compress the data into latent variables about movies, making it more generalized.
#   We will 1) transpose the matrix and 2) pass into the clf the n_components to compress down into.


# Transpose Matrix values
X = ratings_crosstab.values.T
X.shape

(1255, 608)

### Decompose the Matrix

In [63]:
SVD = TruncatedSVD(n_components=12, random_state=17)
result_matrix = SVD.fit_transform(X)
result_matrix.shape

(1255, 12)

### Generating a Correlation Matrix

In [65]:
# Use the Pearson Corrl to determine how similar a user's taste is based on similar user's taste for movies.
# Recommend the item that corrl the most with your movie interest, based on the generalized user tastes.

corr_matrix = np.corrcoef(result_matrix)
corr_matrix.shape

(1255, 1255)

### Isolating 'Star Wars' from the Correlation Matrix

In [66]:
movies_name = ratings_crosstab.columns
movies_name

Index([''Til There Was You (1997)', '1-900 (1994)', '101 Dalmatians (1996)',
       '12 Angry Men (1957)', '187 (1997)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '3 Ninjas: High Noon At Mega Mountain (1998)', '39 Steps, The (1935)',
       ...
       'Wonderland (1997)', 'Wooden Man's Bride, The (Wu Kui) (1994)',
       'Wrong Trousers, The (1993)', 'You So Crazy (1994)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Poisoner's Handbook, The (1995)',
       'unknown', 'Á köldum klaka (Cold Fever) (1994)'],
      dtype='object', name='movieTitle', length=1255)

In [79]:
movies_list = list(ratings_crosstab.columns)
movies_list[:10]

["'Til There Was You (1997)",
 '1-900 (1994)',
 '101 Dalmatians (1996)',
 '12 Angry Men (1957)',
 '187 (1997)',
 '2 Days in the Valley (1996)',
 '20,000 Leagues Under the Sea (1954)',
 '2001: A Space Odyssey (1968)',
 '3 Ninjas: High Noon At Mega Mountain (1998)',
 '39 Steps, The (1935)']

In [80]:
# Find a specific movie (eg. Star Wars)
star_wars_idx = movies_list.index('Star Wars (1977)')
star_wars_idx

1059

In [81]:
corr_star_wars = corr_matrix[star_wars_idx]
corr_star_wars

array([0.32822562, 0.10866697, 0.34190573, ..., 0.33298138, 0.35638163,
       0.55205697])

In [82]:
corr_matrix[star_wars_idx].shape

(1255,)

### Recommending a Highly Correlated Movie

In [87]:
list(movies_name[(corr_star_wars < 1.0) & (corr_star_wars > 0.9) ])

['Ed Wood (1994)',
 'French Twist (Gazon maudit) (1995)',
 'Guilty as Sin (1993)',
 'Promesse, La (1996)',
 "Schindler's List (1993)",
 'Stalingrad (1993)']

In [88]:
list(movies_name[(corr_star_wars == 1.0) ])

['Star Wars (1977)']

In [92]:
list(movies_name[(corr_star_wars > 0.9)])

['Ed Wood (1994)',
 'French Twist (Gazon maudit) (1995)',
 'Guilty as Sin (1993)',
 'Promesse, La (1996)',
 "Schindler's List (1993)",
 'Stalingrad (1993)',
 'Star Wars (1977)']

In [93]:
list(movies_name[(corr_star_wars < 1.0) & (corr_star_wars > 0.95) ])

['Promesse, La (1996)', "Schindler's List (1993)", 'Stalingrad (1993)']

## Content-based Recommender Systems
> Using Nearest neighbor algorithm

In [94]:
from sklearn.neighbors import NearestNeighbors

### Load Dataset (auto-mpg.data)

Source:

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. The dataset was used in the 1983 American Statistical Association Exposition.


Data Set Information:

This dataset is a slightly modified version of the dataset provided in the StatLib library. In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for the "mpg" attribute. The original dataset is available in the file "auto-mpg.data-original".

"The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes." (Quinlan, 1993)


Attribute Information:

1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)

In [103]:
# auto-mpg.data datasource
import pandas as pd

columns = ['mpg','cylinders','displacement', 'horsepower', 'weight', 'acceleration', 
            'model year', 'origin', 'car name']

df_auto = pd.read_csv("./auto-mpg.data", delimiter= "\s+", header=None, names=columns)
display(df_auto.tail())


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10


### EDA

In [116]:
df_auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null float64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [121]:
df_auto['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

### Test Data

In [125]:
testData = [44, 4, 97]  # mpg, cycl, disp  -->  Row #394

In [126]:
X = df_auto.iloc[:,0:3]
X.head()

Unnamed: 0,mpg,cylinders,displacement
0,18.0,8,307.0
1,15.0,8,350.0
2,18.0,8,318.0
3,16.0,8,304.0
4,17.0,8,302.0


### Fit Model

In [127]:
clf = NearestNeighbors(n_neighbors=1)
clf.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                 radius=1.0)

In [128]:
clf.kneighbors([testData])
# 394	32.0 4 135.0 84.00 2295. 11...	dodge rampage

(array([[0.]]), array([[394]], dtype=int64))

> array[0] = 0 is the distance value (p to t) | array[1] = 394 is the recommended row.

### Make Recommendation

In [129]:
df_cars
# 394	32.0 4 135.0 84.00 2295. 11...	dodge rampage

Unnamed: 0,18.0 8 307.0 130.0 3504. 12.0 70 1,chevrolet chevelle malibu
0,15.0 8 350.0 165.0 3693. 11...,buick skylark 320
1,18.0 8 318.0 150.0 3436. 11...,plymouth satellite
2,16.0 8 304.0 150.0 3433. 12...,amc rebel sst
3,17.0 8 302.0 140.0 3449. 10...,ford torino
4,15.0 8 429.0 198.0 4341. 10...,ford galaxie 500
5,14.0 8 454.0 220.0 4354. 9...,chevrolet impala
6,14.0 8 440.0 215.0 4312. 8...,plymouth fury iii
7,14.0 8 455.0 225.0 4425. 10...,pontiac catalina
8,15.0 8 390.0 190.0 3850. 8...,amc ambassador dpl
9,15.0 8 383.0 170.0 3563. 10...,dodge challenger se
