# Part I: Data Gathering and Preprocessing

### Importing Libraries

In [9]:
import numpy as np
import pandas as pd
import collections
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Importing scikit-learn classifiers

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


### Importing Data

In [11]:
video = pd.read_csv("C:/Users/nitis/Downloads/SJSU/SJSU Spring'19/255/Project/youtube-new/USvideos.csv", header=0)
video.head(5)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


### Deleting unused columns and renaming the remaining columns

In [12]:
new_columns = ['title', 'category_id']
new_video = video[new_columns]
new_video.to_csv("C:/Users/nitis/Downloads/SJSU/SJSU Spring'19/255/Project/youtube-new/new_Usvideos.csv", index=False)
new_video = pd.read_csv("C:/Users/nitis/Downloads/SJSU/SJSU Spring'19/255/Project/youtube-new/new_Usvideos.csv", header=0, names=['Title', 'Category_ID'])

### Importing JSON file

In [13]:
category_json = pd.read_json("C:/Users/nitis/Downloads/SJSU/SJSU Spring'19/255/Project/youtube-new/US_category_id.JSON")
category_json.head(5)

Unnamed: 0,kind,etag,items
0,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
1,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
2,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
3,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
4,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."


### Creating a list of Dictionaries with ID and Category label mapping

In [14]:
category_dict = [{'id': item['id'], 'title': item['snippet']['title']} for item in category_json['items']]
category_dict

[{'id': '1', 'title': 'Film & Animation'},
 {'id': '2', 'title': 'Autos & Vehicles'},
 {'id': '10', 'title': 'Music'},
 {'id': '15', 'title': 'Pets & Animals'},
 {'id': '17', 'title': 'Sports'},
 {'id': '18', 'title': 'Short Movies'},
 {'id': '19', 'title': 'Travel & Events'},
 {'id': '20', 'title': 'Gaming'},
 {'id': '21', 'title': 'Videoblogging'},
 {'id': '22', 'title': 'People & Blogs'},
 {'id': '23', 'title': 'Comedy'},
 {'id': '24', 'title': 'Entertainment'},
 {'id': '25', 'title': 'News & Politics'},
 {'id': '26', 'title': 'Howto & Style'},
 {'id': '27', 'title': 'Education'},
 {'id': '28', 'title': 'Science & Technology'},
 {'id': '29', 'title': 'Nonprofits & Activism'},
 {'id': '30', 'title': 'Movies'},
 {'id': '31', 'title': 'Anime/Animation'},
 {'id': '32', 'title': 'Action/Adventure'},
 {'id': '33', 'title': 'Classics'},
 {'id': '34', 'title': 'Comedy'},
 {'id': '35', 'title': 'Documentary'},
 {'id': '36', 'title': 'Drama'},
 {'id': '37', 'title': 'Family'},
 {'id': '38', '

### Creating a DataFrame for the Dictionary

In [15]:
category_df = pd.DataFrame(category_dict)
categories = category_df.rename(index=str, columns = {"id":"Category_ID","title":"Category"})
categories.head(5)

Unnamed: 0,Category_ID,Category
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


# Part II: Training

### Splitting 'title' into string of words using CountVectorizer

In [16]:
vector = CountVectorizer()
counts = vector.fit_transform(new_video['Title'].values)

### Using various classification models and targetting 'Category'

In [17]:
NB_Model = MultinomialNB()
RFC_Model = RandomForestClassifier()
SVC_Model = SVC()
KNC_Model = KNeighborsClassifier()
DTC_Model = DecisionTreeClassifier()

In [18]:
output = new_video['Category_ID'].values

In [20]:
NB_Model.fit(counts,output)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
RFC_Model.fit(counts,output)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
SVC_Model.fit(counts,output)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [23]:
KNC_Model.fit(counts,output)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [24]:
DTC_Model.fit(counts,output)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Checking the accuracy using 90/10 train/test split

In [25]:
X = counts
Y = output
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .1)

In [26]:
NBtest = MultinomialNB().fit(X_train,Y_train)
nb_predictions = NBtest.predict(X_test)
acc_nb = NBtest.score(X_test, Y_test)
print('The Naive Bayes Algorithm has an accuracy of', acc_nb)

The Naive Bayes Algorithm has an accuracy of 0.8981684981684982


In [27]:
RFCtest = RandomForestClassifier().fit(X_train,Y_train)
rfc_predictions = RFCtest.predict(X_test)
acc_rfc = RFCtest.score(X_test, Y_test)
print('The Random Forest Algorithm has an accuracy of', acc_rfc)



The Random Forest Algorithm has an accuracy of 0.9870573870573871


In [28]:
SVCtest = SVC().fit(X_train,Y_train)
svc_predictions = SVCtest.predict(X_test)
acc_svc = SVCtest.score(X_test, Y_test)
print('The Support Vector Algorithm has an accuracy of', acc_svc)



The Support Vector Algorithm has an accuracy of 0.2463980463980464


In [29]:
KNCtest = KNeighborsClassifier().fit(X_train,Y_train)
knc_predictions = KNCtest.predict(X_test)
acc_knc = KNCtest.score(X_test, Y_test)
print('The K Neighbors Algorithm has an accuracy of', acc_knc)

The K Neighbors Algorithm has an accuracy of 0.9477411477411477


In [30]:
DTCtest = DecisionTreeClassifier().fit(X_train,Y_train)
dtc_predictions = DTCtest.predict(X_test)
acc_dtc = DTCtest.score(X_test, Y_test)
print('The Decision Tree Algorithm has an accuracy of', acc_dtc)

The Decision Tree Algorithm has an accuracy of 0.9885225885225886


# Part III: Test

### Entering hypothetical titles to predict the category

In [31]:
Titles = ["Hilarious cat plays with toy",
        "Best fashion looks for Spring 2018",
        "Olympics opening ceremony highlights",
        "Warriors basketball game versus the cavs",
        "CNN world news on donald trump",
        "Police Chase in Hollywood",
        "Ed Sheeran - Perfect (Official Music Video)",
        "how to do eyeshadow",
        "Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goals Resumen & Goles 2019 HD",
        "SPIDER-MAN: FAR FROM HOME - Official Trailer",
          "Chain Restaurant Steak Taste Test",
          "ASSUMPTIONS ABOUT CLICK!"
         ]

### Inserting above titles into each classifier model

In [32]:
Titles_counts = vector.transform(Titles)

### Naive Bayes Model

In [33]:
PredictNB = NB_Model.predict(Titles_counts)
PredictNB

array([24, 24, 17, 17, 25, 26, 10, 26, 17,  1, 26, 24], dtype=int64)

### Random Forest Model

In [34]:
PredictRFC = RFC_Model.predict(Titles_counts)
PredictRFC

array([15, 24, 17, 10, 25, 25, 10, 26, 24,  1, 24, 24], dtype=int64)

### SVC Model

In [35]:
PredictSVC = SVC_Model.predict(Titles_counts)
PredictSVC

array([24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24], dtype=int64)

### K Neighbors Model

In [36]:
PredictKNC = KNC_Model.predict(Titles_counts)
PredictKNC

array([23,  2, 23, 23, 24, 22, 10, 23, 23, 24, 24,  1], dtype=int64)

### Decision Tree Model

In [37]:
PredictDTC = DTC_Model.predict(Titles_counts)
PredictDTC

array([15, 22, 17, 28, 23, 24, 10, 25, 17, 24, 28, 22], dtype=int64)

### Output will be an array of numbers. Iterate through the Category Dictionary (from JSON file) to find "title"

In [38]:
CategoryNamesListNB = []
for Category_ID in PredictNB:
    MatchingCategoriesNB = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesNB:
        CategoryNamesListNB.append(MatchingCategoriesNB[0]["title"])

In [39]:
CategoryNamesListRFC = []
for Category_ID in PredictRFC:
    MatchingCategoriesRFC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesRFC:
        CategoryNamesListRFC.append(MatchingCategoriesRFC[0]["title"])

In [40]:
CategoryNamesListSVC = []
for Category_ID in PredictSVC:
    MatchingCategoriesSVC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesSVC:
        CategoryNamesListSVC.append(MatchingCategoriesSVC[0]["title"])

In [41]:
CategoryNamesListKNC = []
for Category_ID in PredictKNC:
    MatchingCategoriesKNC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesKNC:
        CategoryNamesListKNC.append(MatchingCategoriesKNC[0]["title"])

In [42]:
CategoryNamesListDTC = []
for Category_ID in PredictDTC:
    MatchingCategoriesDTC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesDTC:
        CategoryNamesListDTC.append(MatchingCategoriesDTC[0]["title"])

### Mapping these values to the Titles we want to Predict

In [43]:
TitleDataFrameNB = []
for i in range(0, len(Titles)):
    TitleToCategoriesNB = {'Title': Titles[i],  'Category': CategoryNamesListNB[i]}
    TitleDataFrameNB.append(TitleToCategoriesNB)

In [44]:
TitleDataFrameRFC = []
for i in range(0, len(Titles)):
    TitleToCategoriesRFC = {'Title': Titles[i],  'Category': CategoryNamesListRFC[i]}
    TitleDataFrameRFC.append(TitleToCategoriesRFC)

In [45]:
TitleDataFrameSVC = []
for i in range(0, len(Titles)):
    TitleToCategoriesSVC = {'Title': Titles[i],  'Category': CategoryNamesListSVC[i]}
    TitleDataFrameSVC.append(TitleToCategoriesSVC)

In [46]:
TitleDataFrameKNC = []
for i in range(0, len(Titles)):
    TitleToCategoriesKNC = {'Title': Titles[i],  'Category': CategoryNamesListKNC[i]}
    TitleDataFrameKNC.append(TitleToCategoriesKNC)

In [47]:
TitleDataFrameDTC = []
for i in range(0, len(Titles)):
    TitleToCategoriesDTC = {'Title': Titles[i],  'Category': CategoryNamesListDTC[i]}
    TitleDataFrameDTC.append(TitleToCategoriesDTC)

### Converting the resulting Dictionary to a Data Frame

In [48]:
PredictDFnb = pd.DataFrame(PredictNB)
TitleDFnb = pd.DataFrame(TitleDataFrameNB)
PreFinalDFnb = pd.concat([PredictDFnb, TitleDFnb], axis=1)
PreFinalDFnb.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFnb = PreFinalDFnb.drop(['Categ_ID'],axis=1)
colsNB = FinalDFnb.columns.tolist()
colsNB = colsNB[-1:] + colsNB[:-1]
FinalDFnb= FinalDFnb[colsNB]

In [49]:
PredictDFrfc = pd.DataFrame(PredictRFC)
TitleDFrfc = pd.DataFrame(TitleDataFrameRFC)
PreFinalDFrfc = pd.concat([PredictDFrfc, TitleDFrfc], axis=1)
PreFinalDFrfc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFrfc = PreFinalDFrfc.drop(['Categ_ID'],axis=1)
colsRFC = FinalDFrfc.columns.tolist()
colsRFC = colsRFC[-1:] + colsRFC[:-1]
FinalDFrfc= FinalDFrfc[colsRFC]

In [50]:
PredictDFsvc = pd.DataFrame(PredictSVC)
TitleDFsvc = pd.DataFrame(TitleDataFrameSVC)
PreFinalDFsvc = pd.concat([PredictDFsvc, TitleDFsvc], axis=1)
PreFinalDFsvc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFsvc = PreFinalDFsvc.drop(['Categ_ID'],axis=1)
colsSVC = FinalDFsvc.columns.tolist()
colsSVC = colsSVC[-1:] + colsSVC[:-1]
FinalDFsvc= FinalDFsvc[colsSVC]

In [51]:
PredictDFknc = pd.DataFrame(PredictKNC)
TitleDFknc = pd.DataFrame(TitleDataFrameKNC)
PreFinalDFknc = pd.concat([PredictDFknc, TitleDFknc], axis=1)
PreFinalDFknc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFknc = PreFinalDFknc.drop(['Categ_ID'],axis=1)
colsKNC = FinalDFknc.columns.tolist()
colsKNC = colsKNC[-1:] + colsKNC[:-1]
FinalDFknc= FinalDFknc[colsKNC]

In [52]:
PredictDFdtc = pd.DataFrame(PredictDTC)
TitleDFdtc = pd.DataFrame(TitleDataFrameDTC)
PreFinalDFdtc = pd.concat([PredictDFdtc, TitleDFdtc], axis=1)
PreFinalDFdtc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFdtc = PreFinalDFdtc.drop(['Categ_ID'],axis=1)
colsDTC = FinalDFdtc.columns.tolist()
colsDTC = colsDTC[-1:] + colsDTC[:-1]
FinalDFdtc= FinalDFdtc[colsDTC]

### Viewing the Final Prediction Results

In [53]:
# Naive Bayes 
FinalDFnb

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Hilarious cat plays with toy,Entertainment
1,Best fashion looks for Spring 2018,Entertainment
2,Olympics opening ceremony highlights,Sports
3,Warriors basketball game versus the cavs,Sports
4,CNN world news on donald trump,News & Politics
5,Police Chase in Hollywood,Howto & Style
6,Ed Sheeran - Perfect (Official Music Video),Music
7,how to do eyeshadow,Howto & Style
8,Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goal...,Sports
9,SPIDER-MAN: FAR FROM HOME - Official Trailer,Film & Animation


In [54]:
# Random Forest
FinalDFrfc

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Hilarious cat plays with toy,Pets & Animals
1,Best fashion looks for Spring 2018,Entertainment
2,Olympics opening ceremony highlights,Sports
3,Warriors basketball game versus the cavs,Music
4,CNN world news on donald trump,News & Politics
5,Police Chase in Hollywood,News & Politics
6,Ed Sheeran - Perfect (Official Music Video),Music
7,how to do eyeshadow,Howto & Style
8,Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goal...,Entertainment
9,SPIDER-MAN: FAR FROM HOME - Official Trailer,Film & Animation


In [55]:
# SVC
FinalDFsvc

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Hilarious cat plays with toy,Entertainment
1,Best fashion looks for Spring 2018,Entertainment
2,Olympics opening ceremony highlights,Entertainment
3,Warriors basketball game versus the cavs,Entertainment
4,CNN world news on donald trump,Entertainment
5,Police Chase in Hollywood,Entertainment
6,Ed Sheeran - Perfect (Official Music Video),Entertainment
7,how to do eyeshadow,Entertainment
8,Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goal...,Entertainment
9,SPIDER-MAN: FAR FROM HOME - Official Trailer,Entertainment


In [56]:
# K Neighbors
FinalDFknc

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Hilarious cat plays with toy,Comedy
1,Best fashion looks for Spring 2018,Autos & Vehicles
2,Olympics opening ceremony highlights,Comedy
3,Warriors basketball game versus the cavs,Comedy
4,CNN world news on donald trump,Entertainment
5,Police Chase in Hollywood,People & Blogs
6,Ed Sheeran - Perfect (Official Music Video),Music
7,how to do eyeshadow,Comedy
8,Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goal...,Comedy
9,SPIDER-MAN: FAR FROM HOME - Official Trailer,Entertainment


In [57]:
# Decision Trees
FinalDFdtc

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Hilarious cat plays with toy,Pets & Animals
1,Best fashion looks for Spring 2018,People & Blogs
2,Olympics opening ceremony highlights,Sports
3,Warriors basketball game versus the cavs,Science & Technology
4,CNN world news on donald trump,Comedy
5,Police Chase in Hollywood,Entertainment
6,Ed Sheeran - Perfect (Official Music Video),Music
7,how to do eyeshadow,News & Politics
8,Livеrрооl vs Ваrсеlоnа 4−0 − Highlights & Goal...,Sports
9,SPIDER-MAN: FAR FROM HOME - Official Trailer,Entertainment
