## Data Preparation

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('youtube_scraped.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Video id,Title,Description,Category
0,0,e2NQE41J5eM,How do I travel so much ! How do I earn money!!,SUBSCRIBE - https://goo.gl/dEtSMJ ('MountainTr...,Travel and Blogs
1,1,i9E_Blai8vk,TRAVEL VLOG ∙ Welcome to Bali | PRISCILLA LEE,I had the chance to fly out to Bali with my wh...,Travel and Blogs
2,2,7ByoBJYXU0k,5 Steps to Becoming a Travel Blogger,"Travel blogger, Nikki Vargas, of The Pin the M...",Travel and Blogs
3,3,EthqIhPtd2I,"TRAVEL VLOG: SANTORINI, GREECE",Thank you so much for watching! I hope you fou...,Travel and Blogs
4,4,ehmsJLZlCZ0,Ep 1| Travelling through North East India | Of...,"The journey to Arunachal, North East India beg...",Travel and Blogs


In [2]:
df['Category'].unique()
df.dropna(inplace=True)

### Label Encoding of the target label Category

In [3]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df['targets'] = le.fit_transform(df['Category'])
df.head()

Unnamed: 0.1,Unnamed: 0,Video id,Title,Description,Category,targets
0,0,e2NQE41J5eM,How do I travel so much ! How do I earn money!!,SUBSCRIBE - https://goo.gl/dEtSMJ ('MountainTr...,Travel and Blogs,5
1,1,i9E_Blai8vk,TRAVEL VLOG ∙ Welcome to Bali | PRISCILLA LEE,I had the chance to fly out to Bali with my wh...,Travel and Blogs,5
2,2,7ByoBJYXU0k,5 Steps to Becoming a Travel Blogger,"Travel blogger, Nikki Vargas, of The Pin the M...",Travel and Blogs,5
3,3,EthqIhPtd2I,"TRAVEL VLOG: SANTORINI, GREECE",Thank you so much for watching! I hope you fou...,Travel and Blogs,5
4,4,ehmsJLZlCZ0,Ep 1| Travelling through North East India | Of...,"The journey to Arunachal, North East India beg...",Travel and Blogs,5


In [4]:
list(le.classes_)

['Art and Music',
 'Food',
 'History',
 'Manufacturing',
 'Science and Technology',
 'Travel and Blogs']

In [5]:
le.transform(['Travel and Blogs', 'Food'])

array([5, 1])

### Train Test split of the data

In [6]:
# Thus, all my categories have almost equal number of instances/samples

np.bincount(df['targets'])

array([1864, 2000, 2000, 1922, 1887, 1997])

In [7]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Description'],
                                                   df['targets'],
                                                   random_state=0)

In [8]:
print('X_train first entry:\n\n', X_train.iloc[1])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 Pendrive_Courses for Various Govt. Exams. Click here to know more - https://goo.gl/aTFK6Q or #Call_9580048004 UPSCIQ - A Magazine for UPSC IAS ...


X_train shape:  (8752,)


### To one hot encoding of targets

In [9]:
def to_one_hot(labels, dimension=6):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1
        return results
    
one_hot_train_labels = to_one_hot(y_train)
one_hot_test_labels = to_one_hot(y_test)

In [10]:
one_hot_train_labels

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [12]:
vect.get_feature_names()[::200] #Feature names

['00',
 'appointed',
 'buzybeez',
 'culture',
 'england',
 'gibt',
 'inside',
 'mains',
 'nostra',
 'prepper',
 'sayajigunj',
 'stressed',
 'types',
 'yoga']

In [13]:
len(vect.get_feature_names())

2688

In [14]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<8752x2688 sparse matrix of type '<class 'numpy.int64'>'
	with 174865 stored elements in Compressed Sparse Row format>

# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression(multi_class='ovr')
model.fit(X_train_vectorized, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
print('Test Accuracy: ', model.score(vect.transform(X_test), y_test))

Test Accuracy:  0.9996572995202193


In [17]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['history' 'food' 'manufacturing' 'technology' 'travel' 'science' 'india'
 'has' 'as' 'more']

Largest Coefs: 
['art' 'music' 'source' 'uyysfqr1mrq' 'sleep' 'youtube' 'antonio' 'watch'
 'russo' 'better']


# Naive Bayes Classifier (Using multinomial naive bayes)

In [18]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

print('Test Accuracy: ', model.score(vect.transform(X_test), y_test))

Test Accuracy:  0.9989718985606579


## Tfidf - term frequency inverse weighting

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

2369

In [20]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression(multi_class='ovr')
model.fit(X_train_vectorized, y_train)

model.score(vect.transform(X_test), y_test)



0.9986291980808774

In [21]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['awesome' 'shops' 'restaurants' 'cooks' 'uncle' 'cute' 'kids' 'pink'
 'gave' 'just']

Largest tfidf: 
['science' 'prague' 'group' 'village' 'against' 'cheese' 'steel' 'channel'
 'lead' 'production']


In [22]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['history' 'food' 'travel' 'technology' 'manufacturing' 'science' 'to'
 'we' 'as' 'more']

Largest Coefs: 
['art' 'music' 'by' 'source' 'uyysfqr1mrq' 'sleep' 'paintings' 'artist'
 'antonio' 'russo']


## n-grams

In [23]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

7618

In [24]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression(multi_class='ovr')
model.fit(X_train_vectorized, y_train)

model.score(vect.transform(X_test), y_test)

0.9996572995202193

In [25]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['history' 'food' 'manufacturing' 'technology' 'for' 'travel' 'science'
 'factory' 'in' 'more']

Largest Coefs: 
['art' 'music' 'art and' 'com watch' 'artist' 'source https' 'source'
 'watch uyysfqr1mrq' 'uyysfqr1mrq' 'and music']


# SVM

In [65]:
from sklearn import svm
model = svm.SVC(C= 10)
model.fit(X_train_vectorized, y_train)

model.score(vect.transform(X_test), y_test)



0.997943797121316

In [66]:
from sklearn.metrics import precision_recall_fscore_support

predictions = model.predict(vect.transform(X_test))
precision, recall, f_score, support = precision_recall_fscore_support(y_test, predictions )
precision_recall_fscore_support(y_test, predictions )
#Precision, Recall, F-score and Support for each class

(array([0.99568966, 0.99808795, 0.9940239 , 1.        , 1.        ,
        1.        ]),
 array([0.99784017, 0.99428571, 0.998     , 0.99798793, 1.        ,
        1.        ]),
 array([0.99676375, 0.99618321, 0.99600798, 0.99899295, 1.        ,
        1.        ]),
 array([463, 525, 500, 497, 465, 468]))

In [67]:
df = pd.DataFrame([le.classes_, precision, recall, f_score]).T

In [69]:
df.rename(columns={ 0: 'Category', 1:'precision', 2:'recall', 3: 'f_score'}, inplace = True)

In [70]:
df

Unnamed: 0,Category,precision,recall,f_score
0,Art and Music,0.99569,0.99784,0.996764
1,Food,0.998088,0.994286,0.996183
2,History,0.994024,0.998,0.996008
3,Manufacturing,1.0,0.997988,0.998993
4,Science and Technology,1.0,1.0,1.0
5,Travel and Blogs,1.0,1.0,1.0
