In [1]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from nltk.stem import WordNetLemmatizer
from scipy.sparse import lil_matrix
from sklearn.svm import SVC,LinearSVC
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/train_features.tsv", sep = '\t')
val_data = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/valid_features.tsv", sep = '\t')
test_data = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/test_features.csv")
test_data_final = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/test_features.csv")

In [3]:
train_labels = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/train_labels.tsv", sep = '\t')
val_labels = pd.read_csv("/Users/varunsai/Desktop/IML/Assignment-2/data/valid_labels.tsv", sep = '\t')

In [4]:
train_data.drop(['movieId','YTId'], axis=1, inplace=True)
val_data.drop(['movieId','YTId'], axis=1, inplace=True)
test_data.drop(['movieId','YTId'], axis=1, inplace=True)

In [5]:
train_data['tag']=train_data['tag'].apply(lambda word:re.sub('[^a-zA-Z]',' ',word))
val_data['tag']=val_data['tag'].apply(lambda word:re.sub('[^a-zA-Z]',' ',word))
test_data['tag']=test_data['tag'].apply(lambda word:re.sub('[^a-zA-Z]',' ',word))

In [6]:
train_data['title']=train_data['title'].apply(lambda word:re.sub('[^a-zA-Z]',' ',str(word)))
val_data['title']=val_data['title'].apply(lambda word:re.sub('[^a-zA-Z]',' ',str(word)))
test_data['title']=test_data['title'].apply(lambda word:re.sub('[^a-zA-Z]',' ',str(word)))

In [7]:
train_labels = train_labels['genres']
val_labels = val_labels['genres']

In [8]:
wl = WordNetLemmatizer()

def stem_tokens(inp):
    words = [wl.lemmatize(word) for word in inp.split()]
    return words

In [9]:
word_vectorizer = TfidfVectorizer(tokenizer=stem_tokens, norm='l1')

In [10]:
train_tag = word_vectorizer.fit_transform(train_data['tag'].tolist())
val_tag = word_vectorizer.transform(val_data['tag'].tolist())
test_tag = word_vectorizer.transform(test_data['tag'].tolist())
#word_vectorizer.get_feature_names()

In [11]:
train_title = word_vectorizer.fit_transform(train_data['title'].tolist())
val_title = word_vectorizer.transform(val_data['title'].tolist())
test_title = word_vectorizer.transform(test_data['title'].tolist())

In [12]:
train_tag_df = pd.DataFrame(train_tag.toarray())
val_tag_df = pd.DataFrame(val_tag.toarray())
test_tag_df = pd.DataFrame(test_tag.toarray())

In [13]:
train_title_df = pd.DataFrame(train_title.toarray())
val_title_df = pd.DataFrame(val_title.toarray())
test_title_df = pd.DataFrame(test_title.toarray())

In [14]:
train_data.drop(['title','tag','year'], axis=1, inplace=True)
val_data.drop(['title','tag','year'], axis=1, inplace=True)
test_data.drop(['title','tag','year'], axis=1, inplace=True)

In [15]:
train_data = pd.concat([train_data,train_tag_df,train_title_df], axis=1)
val_data = pd.concat([val_data,val_tag_df,val_title_df], axis=1)
test_data = pd.concat([test_data,test_tag_df,test_title_df], axis=1)

In [16]:
def df_to_csr(df):
    arr = lil_matrix(df.shape, dtype=np.float32)
    for i, col in enumerate(df.columns):
        ix = df[col] != 0
        arr[np.where(ix), i] = 1

    return arr.tocsr()

y_train = train_labels
X_train = df_to_csr(train_data)

y_val = val_labels
X_val = df_to_csr(val_data)

X_test = df_to_csr(test_data)

In [17]:
#Multinomial Naive Bayes

In [18]:
nb = MultinomialNB()

In [19]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
prediction_val_nb = nb.predict(X_val)

In [21]:
prediction_test_nb = nb.predict(X_test)

In [22]:
metrics = classification_report(prediction_val_nb, y_val)

In [23]:
print(metrics)

              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         0
   Adventure       0.00      0.00      0.00         0
   Animation       0.00      0.00      0.00         0
    Children       0.00      0.00      0.00         0
      Comedy       0.53      0.36      0.43        56
       Crime       0.00      0.00      0.00         0
 Documentary       0.06      1.00      0.11         1
       Drama       0.72      0.23      0.35       132
     Fantasy       0.22      1.00      0.36         4
   Film_Noir       0.00      0.00      0.00         0
      Horror       0.00      0.00      0.00         0
     Musical       0.00      0.00      0.00         0
     Mystery       0.00      0.00      0.00         0
     Romance       0.35      0.35      0.35        51
      Sci_Fi       0.62      0.71      0.67        14
    Thriller       0.50      0.36      0.42        39
         War       0.10      1.00      0.17         2
     Western       0.00    

In [24]:
#Support vector machine

In [25]:
svm = SVC(kernel="linear",decision_function_shape='ovo')

In [26]:
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
prediction_val_svm = svm.predict(X_val)

In [28]:
prediction_test_svm = svm.predict(X_test)

In [29]:
metrics = classification_report(prediction_val_svm, y_val)

In [30]:
print(metrics)

              precision    recall  f1-score   support

      Action       0.17      0.50      0.25         2
   Adventure       0.50      0.20      0.29         5
   Animation       0.33      0.33      0.33         3
    Children       0.33      0.11      0.17         9
      Comedy       0.45      0.39      0.41        44
       Crime       0.20      0.20      0.20         5
 Documentary       0.61      0.55      0.58        20
       Drama       0.53      0.32      0.40        71
     Fantasy       0.22      0.33      0.27        12
   Film_Noir       0.00      0.00      0.00         2
      Horror       0.38      0.30      0.33        10
     Musical       0.10      0.12      0.11         8
     Mystery       0.11      0.29      0.16         7
     Romance       0.33      0.36      0.35        47
      Sci_Fi       0.56      0.64      0.60        14
    Thriller       0.21      0.20      0.21        30
         War       0.38      0.80      0.52        10
     Western       0.00    

In [31]:
#Gradient Boosting Classifier

In [32]:
gbc = GradientBoostingClassifier()

In [33]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [34]:
prediction_val_gbc = gbc.predict(X_val)

In [35]:
prediction_test_gbc = gbc.predict(X_test)

In [36]:
metrics = classification_report(prediction_val_gbc, y_val)

In [37]:
print(metrics)

              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         1
   Adventure       0.00      0.00      0.00         3
   Animation       0.33      0.33      0.33         3
    Children       0.00      0.00      0.00         0
      Comedy       0.47      0.49      0.48        37
       Crime       0.40      0.33      0.36         6
 Documentary       0.44      0.53      0.48        15
       Drama       0.51      0.21      0.30       105
     Fantasy       0.33      0.60      0.43        10
   Film_Noir       0.25      0.20      0.22         5
      Horror       0.38      0.75      0.50         4
     Musical       0.20      0.25      0.22         8
     Mystery       0.22      0.67      0.33         6
     Romance       0.25      0.36      0.30        36
      Sci_Fi       0.69      0.69      0.69        16
    Thriller       0.32      0.36      0.34        25
         War       0.43      0.50      0.46        18
     Western       0.00    

In [38]:
#Decision tree classifier

In [39]:
dt = DecisionTreeClassifier()

In [40]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [41]:
prediction_val_dt = dt.predict(X_val)

In [42]:
prediction_test_dt = dt.predict(X_test)

In [43]:
metrics = classification_report(prediction_val_dt, y_val)

In [44]:
print(metrics)

              precision    recall  f1-score   support

      Action       0.00      0.00      0.00         4
   Adventure       0.00      0.00      0.00         3
   Animation       0.00      0.00      0.00         1
    Children       0.00      0.00      0.00         3
      Comedy       0.45      0.47      0.46        36
       Crime       0.20      0.25      0.22         4
 Documentary       0.39      0.64      0.48        11
       Drama       0.42      0.30      0.35        60
     Fantasy       0.28      0.31      0.29        16
   Film_Noir       0.25      1.00      0.40         1
      Horror       0.12      0.12      0.12         8
     Musical       0.10      0.11      0.11         9
     Mystery       0.00      0.00      0.00         4
     Romance       0.43      0.36      0.39        61
      Sci_Fi       0.69      0.50      0.58        22
    Thriller       0.50      0.35      0.41        40
         War       0.38      0.57      0.46        14
     Western       0.00    

In [54]:
#Output for kaggle test
#prediction_kaggle variable can take test prediction of any of the above model.

In [50]:
prediction_kaggle = pd.DataFrame(prediction_test_gbc)

In [51]:
test_data_final['genres'] = prediction_kaggle

In [52]:
kaggle = pd.DataFrame(test_data_final,columns = ["movieId", "genres"])

In [53]:
kaggle.to_csv("/Users/varunsai/Desktop/kaggle.csv",index=None)