In [10]:
# Future Imports
from __future__ import division, print_function, absolute_import, unicode_literals

# Data Cleaning
from pandas import read_csv, DataFrame, pivot_table
from numpy import NaN, array

# Pairwise-Distance
from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import correlation

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Removing Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def dataset_load(filepath):
    """Read a CSV file from the current working directory.

    `filepath` is the file name without its extension; the function
    prepends "./" and appends ".csv" before handing the path to
    `read_csv`, and returns the resulting DataFrame.
    """
    csv_path = "".join(["./", filepath, '.csv'])
    return read_csv(csv_path)

# Load ./reddit_data.csv (dataset_load adds the "./" prefix and ".csv" suffix).
dataframe = dataset_load('reddit_data')

In [None]:
# Preview the first rows of the loaded frame.
dataframe.head()

In [None]:
# Column dtypes and non-null counts.
dataframe.info()

In [None]:
# Summary statistics restricted to the object-dtype columns.
dataframe.describe(include=['O'])

#### Total number of unique users: 22610
#### Total number of unique subreddits: 34967

In [None]:
# Number of distinct subreddits per username.
username = dataframe.groupby(['username'])['subreddit'].nunique()

In [None]:
# Number of distinct usernames per subreddit.
subreddits = dataframe.groupby(['subreddit'])['username'].nunique()

In [None]:
# Ten usernames with the largest distinct-subreddit counts, largest first.
top_ten_users = username.sort_values(ascending=False).head(10)

### Users with highest Subreddit Count

In [None]:
# Bar chart of the ten users selected above; the set() return value is bound
# to `_` so the cell does not echo it.
p = top_ten_users.plot(kind='bar', figsize=(12,5), color=sns.color_palette()[0], fontsize=10)
_ = p.set(xlabel='Username', ylabel='Count of SubReddits')

In [None]:
# Ten subreddits with the largest distinct-user counts, largest first.
top_ten_subreddits = subreddits.sort_values(ascending=False).head(10)

### Subreddits with the highest number of unique users

In [None]:
# Bar chart of the ten subreddits selected above; the set() return value is
# bound to `_` so the cell does not echo it.
p = top_ten_subreddits.plot(kind='bar', figsize=(12,5), color=sns.color_palette()[3], fontsize=10)
_ = p.set(ylabel='Count of Users', xlabel='SubReddits')

### Finding Out Null Entries

In [None]:
# Count missing values per column. The previous `.loc[True]` performed a label
# lookup on the row index (not a filter on null cells), so it never produced
# a null count.
dataframe.isnull().sum()

In [45]:
# Independent deep copy of the loaded frame; the sampling step is run on this copy.
dataframe_second = dataframe.copy(deep=True)

In [47]:
# you can also use data.sum(axis=1) here
def dataframe_sampling(data, frac=0.01, random_state=None):
    """Count user/subreddit pairs, draw a random sample, and pivot the sample.

    Parameters
    ----------
    data : DataFrame
        Must contain 'username' and 'subreddit' columns.
    frac : float, default 0.01
        Fraction of the (username, subreddit) count rows kept in the sample.
    random_state : int or None, default None
        Forwarded to `DataFrame.sample`. Pass an int to make the sample (and
        everything derived from it) reproducible; None keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        counts  -- one row per (username, subreddit) pair with columns
                   'subreddit', 'username', 'count'.
        sampled -- the random subset of `counts`, re-indexed from 0.
        pivoted -- `sampled` reshaped to one row per subreddit and one column
                   per username (plus a leading 'subreddit' column); pairs
                   absent from the sample are filled with 0.
    """
    # Naming the counts Series explicitly keeps reset_index() from clashing
    # with the 'subreddit' index level, whatever name value_counts() gave it.
    counts = (
        data.groupby(['username'])['subreddit']
        .value_counts()
        .rename('count')
        .reset_index()
    )[['subreddit', 'username', 'count']]

    sampled = counts.sample(frac=frac, random_state=random_state).reset_index(drop=True)

    # Keyword arguments: positional index/columns/values are not accepted by
    # newer pandas releases.
    pivoted = (
        sampled.pivot(index='subreddit', columns='username', values='count')
        .fillna(0)
        .reset_index()
    )
    return counts, sampled, pivoted

# NOTE(review): binding the result to `pivot_table` replaces the
# `pandas.pivot_table` function imported in the first cell for the rest of
# the notebook.
dataframe_third, data, pivot_table = dataframe_sampling(dataframe_second)

In [50]:
# Number of rows (one per subreddit) in the sampled pivot.
pivot_table.subreddit.shape

(3160,)

### Popularity Based Collaborative Filtering Technique (PBCFT)

In [5]:
# Independent deep copy of the sampled rows, used as input to the popularity model.
data2 = data.copy(deep=True)

In [6]:
def popularity_recommendation(data, top_n=10):
    """Rank subreddits by how many 'username' entries they have.

    Parameters
    ----------
    data : DataFrame
        Must contain 'subreddit' and 'username' columns.
    top_n : int, default 10
        Number of top-ranked subreddits to return.

    Returns
    -------
    DataFrame
        Columns 'subreddit', 'score' (non-null username count for that
        subreddit) and 'rank', limited to the `top_n` best rows and ordered
        by score, then subreddit name, both descending.
    """
    scores = (
        data.groupby(['subreddit'])
        .agg({'username': 'count'})
        .reset_index()
        .rename(columns={'username': 'score'})
    )
    ranked = scores.sort_values(['score', 'subreddit'], ascending=False)
    # method='first' breaks score ties by row order, i.e. the sort order above,
    # so ranks run 1, 2, 3, ... down the sorted frame.
    ranked['rank'] = ranked['score'].rank(ascending=False, method='first')
    return ranked.head(top_n)

In [None]:
# Top-ranked subreddits computed from the sampled rows.
popularity_recomm = popularity_recommendation(data2)

In [None]:
def recommend(recommend, username=None):
    """Attach a username to a recommendation table and return it.

    Parameters
    ----------
    recommend : DataFrame
        Recommendation rows (e.g. the output of popularity_recommendation).
        It is not modified.
    username : str or None, default None
        Value written to every row of the 'username' column. When None the
        value is read interactively from standard input with an empty prompt.

    Returns
    -------
    DataFrame
        A copy of the input with 'username' as the first column and a fresh
        0..n-1 index.
    """
    if username is None:
        # raw_input only exists on Python 2; Python 3 renamed it to input.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        username = read_line('')

    # Work on a copy so the caller's frame does not silently gain a column.
    result = recommend.copy()
    result['username'] = username
    ordered = ['username'] + [col for col in result.columns if col != 'username']
    return result[ordered].reset_index(drop=True)

In [None]:
# Display the popularity recommendations; recommend() reads the username from standard input.
recommend(popularity_recomm)

### Item-Item Based Collaborative Filtering Technique (i-iBCFT)

In [71]:
def Distance_of_item(pivot, metric):
    """Build an item-by-item score matrix from a subreddit pivot.

    Despite the name, the returned values are ``1 - distance``, so larger
    numbers mean the two rows of `pivot` are closer under `metric`.

    Parameters
    ----------
    pivot : DataFrame
        Must have a 'subreddit' column; every other column is used as a
        feature of that row.
    metric : str or callable
        Passed unchanged to `pairwise_distances` (the notebook uses
        'correlation', 'cosine' and 'jaccard').

    Returns
    -------
    DataFrame
        Square frame whose index and columns are both `pivot.subreddit`.
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    feature_matrix = pivot.drop('subreddit', axis=1).values
    scores = 1 - pairwise_distances(feature_matrix, metric=metric)
    return DataFrame(scores, columns=pivot.subreddit, index=pivot.subreddit)

In [76]:
# Item-by-item scores under the 'correlation' metric (Distance_of_item returns 1 - distance).
new_item_pivot_table_correlation_distances = Distance_of_item(pivot_table, 'correlation')
new_item_pivot_table_correlation_distances.head()

subreddit,1200isplenty,195,2007scape,22lr,240sx,24hoursupport,2meirl4meirl,30ROCK,365PhotoProject,3DS,...,youtube,youtubehaiku,yoyhammer,yugijerk,yugioh,yuri,yuruyuri,zelda,zooeydeschanel,zsh
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1200isplenty,1.0,-0.000158,-0.000183,-0.000158,-0.000158,-0.000256,-0.000162,-0.000223,-0.000158,-0.000158,...,-0.000158,-0.000329,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158
195,-0.000158,1.0,-0.000183,-0.000158,-0.000158,-0.000256,-0.000162,-0.000223,-0.000158,-0.000158,...,-0.000158,-0.000329,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158
2007scape,-0.000183,-0.000183,1.0,-0.000183,-0.000183,-0.000297,-0.000188,-0.000259,-0.000183,-0.000183,...,-0.000183,-0.000382,-0.000183,-0.000183,-0.000183,-0.000183,-0.000183,-0.000183,-0.000183,-0.000183
22lr,-0.000158,-0.000158,-0.000183,1.0,-0.000158,-0.000256,-0.000162,-0.000223,-0.000158,-0.000158,...,-0.000158,-0.000329,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158
240sx,-0.000158,-0.000158,-0.000183,-0.000158,1.0,-0.000256,-0.000162,-0.000223,-0.000158,-0.000158,...,-0.000158,-0.000329,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158,-0.000158


In [77]:
# Ten highest correlation-based scores in the 'soccer' column, largest first.
# NOTE(review): the pivot is built from a random 1% sample, so a 'soccer'
# column is presumably not guaranteed on every run -- confirm.
new_item_pivot_table_correlation_distances.soccer.sort_values(ascending=False)[0:10]

subreddit
soccer              1.000000
notinteresting      0.047670
madlads             0.004427
PS4                 0.002086
MarksmanMains       0.001703
NoMansSkyTheGame    0.000422
cringe              0.000304
playstation         0.000087
INTP               -0.000354
Hotwife            -0.000354
Name: soccer, dtype: float64

In [75]:
# Item-by-item scores under the 'cosine' metric (Distance_of_item returns 1 - distance).
new_item_pivot_table_cosine_distances = Distance_of_item(pivot_table, 'cosine')

In [78]:
# Ten highest cosine-based scores in the 'soccer' column, largest first.
new_item_pivot_table_cosine_distances.soccer.sort_values(ascending=False)[0:10]

subreddit
soccer              1.000000
notinteresting      0.048160
madlads             0.005483
PS4                 0.002927
MarksmanMains       0.002056
NoMansSkyTheGame    0.001282
cringe              0.001095
playstation         0.000620
gaming              0.000011
Phlebology          0.000000
Name: soccer, dtype: float64

In [79]:
# Item-by-item scores under the 'jaccard' metric (Distance_of_item returns 1 - distance).
new_item_pivot_table_jaccard_distances = Distance_of_item(pivot_table, 'jaccard')

In [80]:
# Ten highest jaccard-based scores in the 'soccer' column, largest first.
new_item_pivot_table_jaccard_distances.soccer.sort_values(ascending=False)[0:10]

subreddit
soccer              1.000000
MarksmanMains       0.062500
playstation         0.055556
notinteresting      0.047619
cringe              0.037037
madlads             0.037037
PS4                 0.037037
NoMansSkyTheGame    0.037037
gaming              0.009524
Phlebology          0.000000
Name: soccer, dtype: float64