In [3]:
# jupyter nbconvert video_category_slides.ipynb --to slides --post serve


<h1 style="color:darkblue;font-size:46px; ">Text classification<h1/>


<p style="color:darkblue;font-size:26px; ">Importing the text<p/>


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('data/videos_desc.csv')
dataset.head()

Unnamed: 0,description,category_id
0,SHANTELL'S CHANNEL - https://www.youtube.com/s...,22
1,"One year after the presidential election, John...",24
2,WATCH MY PREVIOUS VIDEO â–¶ \n\nSUBSCRIBE â–º ...,23
3,Today we find out if Link is a Nickelback amat...,24
4,I know it's been a while since we did this sho...,24


<p style="color:darkblue;font-size:26px; ">Cleaning the texts<p/>


In [2]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 4751):
    review = re.sub('[^a-zA-Z]', ' ', dataset.iloc[i, 0])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ronny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<p style="color:darkblue;font-size:26px; ">Creating the Bag of Words model<p/>


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.loc[:, 'category_id'].values

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X).toarray()

<p style="color:darkblue;font-size:26px; ">Splitting the dataset into the Training set and Test set<p/>


In [6]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



<p style="color:darkblue;font-size:36px; ">Naive Bayes<p/>
<p style="color:darkblue;font-size:24px;padding: 10px; ">a family of simple probabilistic classifiers based on applying Bayes' theorem with strong (naive) independence assumptions between the features.<p/>
<a style="padding: 10px;" src='https://en.wikipedia.org/wiki/Naive_Bayes_classifier'>wikpedia</a>

In [7]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.5772870662460567


<p style="color:darkblue;font-size:36px; ">SVM<p/>
<p style="color:darkblue;font-size:24px;padding: 10px; ">a support vector machine constructs a hyperplane or set of hyperplanes in a high- or infinite-dimensional space, which can be used for classification, regression, or other tasks like outliers detection<p/>
<a style="padding: 10px;" src='https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm'>wikpedia</a>
<img src='https://upload.wikimedia.org/wikipedia/commons/b/b5/Svm_separating_hyperplanes_%28SVG%29.svg' height="150" width="150">


In [8]:
# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.9369085173501577


<p style="color:darkblue;font-size:36px; ">k-NN<p/>
<p style="color:darkblue;font-size:24px;padding: 10px; ">a non-parametric method used for classification and regression. In both cases, the input consists of the k closest training examples in the feature space.<br>
<p/>
<a style="padding: 10px;" src='https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm'>wikpedia</a>

In [9]:
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.45531019978969506


<p style="color:darkblue;font-size:36px;">Logistic Regression<p/>
<p style="color:darkblue;font-size:24px; padding: 10px; ">a regression model where the dependent variable (DV) is categorical<p/>
<img src='https://upload.wikimedia.org/wikipedia/commons/3/3a/Linear_regression.svg' height="170" width="150">

<a  style="padding: 10px;" src='https://en.wikipedia.org/wiki/Logistic_regression'>wikpedia</a>

In [10]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.8948475289169295


<p style="color:darkblue;font-size:36px; ">Decision Tree<p/>
<p style="color:darkblue;font-size:24px; padding: 10px;">a decision support tool that uses a tree-like graph or model of decisions and their possible consequences<p/>
<a style="padding: 10px;" src='https://en.wikipedia.org/wiki/Decision_tree'>wikpedia</a>

In [12]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.5068349106203995


<p style="color:darkblue;font-size:36px; ">Random Forest<p/>
<p style="color:darkblue;font-size:24px; padding: 10px;">an ensemble learning method for classification, regression and other tasks, that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees<p/>

<a style="padding: 10px;" src='https://en.wikipedia.org/wiki/Random_forest'>wikpedia</a>

In [14]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# accuracy
accuracy = 0
for i in y_pred:
    if y_pred[i] == y_test[i]:
        accuracy +=1
print('accuracy: ', accuracy/len(y_pred))

accuracy:  0.843322818086225
