In [93]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tokenize.regexp import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['raw_text.csv']


In [63]:
df = pd.read_csv("../input/raw_text.csv")
df.head()

Unnamed: 0,text,label
0,The following content is\nprovided under a Cre...,Calculus
1,"In this sequence of segments,\nwe review some ...",Probability
2,The following content is\nprovided under a Cre...,CS
3,The following\ncontent is provided under a Cre...,Algorithms
4,The following\ncontent is provided under a Cre...,Algorithms


In [64]:
tags = list(df.label.unique())
print(tags)

['Calculus', 'Probability', 'CS', 'Algorithms', 'Diff. Eq.', 'Linear Algebra', 'AI', 'Statistics', 'Math for Eng.', 'Data Structures', 'NLP']


In [65]:
index_to_tags_dict = {i:tag for i,tag in enumerate(tags)}
tags_to_index_dict = {tag:i for i,tag in enumerate(tags)}
print(index_to_tags_dict)
print(tags_to_index_dict)

{0: 'Calculus', 1: 'Probability', 2: 'CS', 3: 'Algorithms', 4: 'Diff. Eq.', 5: 'Linear Algebra', 6: 'AI', 7: 'Statistics', 8: 'Math for Eng.', 9: 'Data Structures', 10: 'NLP'}
{'Calculus': 0, 'Probability': 1, 'CS': 2, 'Algorithms': 3, 'Diff. Eq.': 4, 'Linear Algebra': 5, 'AI': 6, 'Statistics': 7, 'Math for Eng.': 8, 'Data Structures': 9, 'NLP': 10}


In [66]:
bag_of_words = {}
sentence_tokens = []
for text in df.text:
    tokens = WordPunctTokenizer().tokenize(text)
    token_list = []
    for token in tokens:
        token = token.lower()
        if token not in '''!()-[]{};:'"\,<>./?@#$%^&*_~''':
            try:
                int(token)
                continue
            except:
                token_list.append(token)
                bag_of_words[token] = bag_of_words.get(token,0)+1
    sentence_tokens.append(token_list)

print(len(bag_of_words))

23769


In [67]:
# Taking top n words
n = 10000
top_n_words = sorted(bag_of_words.items(), key = lambda item: item[1], reverse = True)[:n]
print(top_n_words)
top_n_words_to_index = {item[0]:i for i,item in enumerate(top_n_words)}
print(top_n_words_to_index)



In [68]:
# Processed Subtitles
subtitles = []
for token_list in sentence_tokens:
    sub = []
    for token in token_list:
        if top_n_words_to_index.get(token,-1) != -1:
            sub.append(token)
    sub = " ".join(sub)
    subtitles.append(sub)
print(subtitles[0])

the following content is provided under a creative commons license your support will help mit opencourseware continue to offer high quality educational resources for free to make a donation or to view additional materials from hundreds of mit courses visit mit opencourseware at ocw mit edu professor so professor jerison is relaxing in london today and sent me in as his substitute again i m glad to the here and see you all again so our agenda today he said that he d already talked about power series and taylor s formula i guess on last week right on friday so i m going to go a little further with that and show you some examples show you some applications and then i have this course evaluation survey that i ll hand out in the last minutes or so of the class i also have this handout that he made that says end of term if you didn t pick this up coming in grab it going out people tend not to pick it up when they walk in i see so grab this when you re going out there s some things missing fr

In [78]:
# Word Binary, Word Count and Tfidf Features for each sentence
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(subtitles)
word_count_features = np.array(X.toarray())
word_binary_features = np.array(word_count_features>0, dtype = int)

vectorizer2 = TfidfVectorizer()
X2 =  vectorizer2.fit_transform(subtitles)
tfidf_features = np.array(X2.toarray())

print(word_count_features.shape)
print(word_binary_features.shape)
print(word_binary_features[0])
print(word_count_features[0])
print(tfidf_features[0])

(860, 9890)
(860, 9890)
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0. 0. 0. ... 0. 0. 0.]


In [84]:
labels = [tags_to_index_dict[label] for label in df.label]
print(labels)

[0, 1, 2, 3, 3, 0, 3, 4, 5, 5, 4, 2, 1, 1, 2, 6, 3, 2, 3, 0, 3, 1, 4, 7, 2, 8, 5, 2, 5, 9, 1, 2, 5, 0, 1, 5, 0, 2, 1, 5, 0, 0, 5, 10, 3, 5, 7, 3, 4, 0, 3, 2, 1, 3, 2, 5, 5, 4, 7, 1, 10, 3, 9, 0, 5, 9, 4, 1, 1, 4, 4, 0, 1, 7, 4, 5, 0, 7, 2, 2, 3, 3, 4, 0, 5, 5, 2, 0, 5, 1, 5, 3, 2, 2, 9, 0, 5, 0, 0, 1, 5, 6, 2, 1, 1, 4, 10, 7, 4, 6, 4, 1, 7, 4, 5, 7, 1, 1, 5, 0, 5, 5, 8, 0, 2, 5, 1, 1, 9, 7, 4, 3, 2, 1, 5, 5, 7, 1, 5, 1, 2, 1, 7, 9, 8, 1, 9, 2, 3, 4, 5, 1, 9, 3, 4, 3, 5, 2, 0, 7, 5, 9, 6, 9, 5, 10, 3, 3, 9, 5, 5, 1, 1, 5, 10, 1, 3, 0, 0, 5, 3, 0, 5, 4, 5, 7, 5, 5, 0, 8, 4, 3, 3, 3, 9, 1, 3, 0, 7, 2, 1, 7, 0, 5, 5, 5, 4, 2, 2, 9, 6, 2, 4, 4, 2, 6, 2, 5, 2, 3, 1, 7, 4, 6, 10, 5, 4, 6, 4, 3, 2, 9, 2, 6, 7, 2, 3, 5, 2, 7, 8, 9, 9, 5, 3, 1, 1, 2, 5, 5, 5, 1, 9, 5, 5, 8, 1, 9, 5, 0, 5, 3, 1, 6, 4, 1, 3, 4, 4, 3, 4, 2, 5, 7, 9, 6, 1, 7, 10, 6, 7, 7, 1, 5, 6, 3, 7, 5, 5, 10, 1, 7, 9, 5, 7, 1, 9, 9, 7, 10, 3, 0, 2, 3, 1, 5, 2, 6, 9, 7, 6, 1, 4, 10, 1, 4, 7, 8, 5, 5, 4, 6, 3, 7, 5, 0, 5, 2, 1, 4,

In [85]:
train = 700

In [98]:
# Multiclass Logistic Classifier for Word Binary Features
clf_logistic_wb = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=10000)
clf_logistic_wb = clf_logistic_wb.fit(word_binary_features[:train], labels[:train])
pred_logistic_wb = clf_logistic_wb.predict(word_binary_features[train:])
accuracy_logistic_wb = np.mean(pred_logistic_wb==labels[train:])*100
print("Accuracy =",accuracy_logistic_wb)

Accuracy = 10.0


In [99]:
# Multiclass SVM Classifier for Word Binary Features
clf_svm_wb = svm.SVC(gamma='scale')
clf_svm_wb = clf_svm_wb.fit(word_binary_features[:train], labels[:train])
pred_svm_wb = clf_svm_wb.predict(word_binary_features[train:])
accuracy_svm_wb = np.mean(pred_svm_wb==labels[train:])*100
print("Accuracy =",accuracy_svm_wb)

Accuracy = 15.0


In [100]:
# Mutliclass Naive Bayes Classifier for Word Binary Features
clf_nb_wb = GaussianNB()
clf_nb_wb = clf_nb_wb.fit(word_binary_features[:train], labels[:train])
pred_nb_wb = clf_nb_wb.predict(word_binary_features[train:])
accuracy_nb_wb = np.mean(pred_nb_wb==labels[train:])*100
print("Accuracy =",accuracy_nb_wb)

Accuracy = 15.0


In [108]:
# Multiclass Logistic Classifier for Word Count Features
clf_logistic_wc = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=2000)
clf_logistic_wc = clf_logistic_wc.fit(word_count_features[:train], labels[:train])
pred_logistic_wc = clf_logistic_wc.predict(word_count_features[train:])
accuracy_logistic_wc = np.mean(pred_logistic_wc==labels[train:])*100
print("Accuracy =",accuracy_logistic_wc)

Accuracy = 13.750000000000002




In [109]:
# Multiclass SVM Classifier for Word Count Features
clf_svm_wc = svm.SVC(gamma='scale')
clf_svm_wc = clf_svm_wc.fit(word_count_features[:train], labels[:train])
pred_svm_wc = clf_svm_wc.predict(word_count_features[train:])
accuracy_svm_wc= np.mean(pred_svm_wc==labels[train:])*100
print("Accuracy =",accuracy_svm_wc)

Accuracy = 15.625


In [110]:
# Mutliclass Naive Bayes Classifier for Word Count Features
clf_nb_wc = GaussianNB()
clf_nb_wc = clf_nb_wc.fit(word_count_features[:train], labels[:train])
pred_nb_wc = clf_nb_wc.predict(word_count_features[train:])
accuracy_nb_wc = np.mean(pred_nb_wc==labels[train:])*100
print("Accuracy =",accuracy_nb_wc)

Accuracy = 9.375


In [111]:
# Multiclass Logistic Classifier for Tfidf Features
clf_logistic_tfidf = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', max_iter=2000)
clf_logistic_tfidf = clf_logistic_tfidf.fit(tfidf_features[:train], labels[:train])
pred_logistic_tfidf = clf_logistic_tfidf.predict(tfidf_features[train:])
accuracy_logistic_tfidf = np.mean(pred_logistic_tfidf==labels[train:])*100
print("Accuracy =",accuracy_logistic_tfidf)

Accuracy = 13.750000000000002


In [112]:
# Multiclass SVM Classifier for Word Tfidf Features
clf_svm_tfidf = svm.SVC(gamma='scale')
clf_svm_tfidf = clf_svm_tfidf.fit(tfidf_features[:train], labels[:train])
pred_svm_tfidf = clf_svm_tfidf.predict(tfidf_features[train:])
accuracy_svm_tfidf= np.mean(pred_svm_tfidf==labels[train:])*100
print("Accuracy =",accuracy_svm_tfidf)

Accuracy = 16.875


In [113]:
# Mutliclass Naive Bayes Classifier for Tfidf Features
clf_nb_tfidf = GaussianNB()
clf_nb_tfidf = clf_nb_tfidf.fit(tfidf_features[:train], labels[:train])
pred_nb_tfidf = clf_nb_tfidf.predict(tfidf_features[train:])
accuracy_nb_tfidf = np.mean(pred_nb_tfidf==labels[train:])*100
print("Accuracy =",accuracy_nb_tfidf)

Accuracy = 9.375
