### Part 1: NLP

Solve spam sms detection problem using Gensim word2vec. You can use algorithm of your choice to train and evaluate the model.

In [84]:
import pandas as pd
import numpy as np 
import gensim 
from sklearn.model_selection import  train_test_split
import nltk 
from nltk.corpus import stopwords
import re

from google.colab import files
files.upload()

data = pd.read_csv("spam.csv", encoding='latin-1')
data = data.iloc[:,:2]

data.columns = ["label","text"]
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [85]:
X = data.iloc[:,1]
Y = data.iloc[:,0]

Y = Y.map({"ham":0, "spam":1})  # to perform binary classification 
Y = np.array(Y).reshape(-1,1)
Y

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [86]:
nltk.download("stopwords")
words = [text.split() for text in X]
stop_words = set(stopwords.words('english'))

text = X.copy()
text = text.str.lower().str.split()
#print(text)

for index, stop in enumerate(stop_words):
  text = text.apply(lambda x : [word for word in x if word not in stop])

#print(text)
texts = text.to_list()
data['stopped'] = texts
data

data['cleaned'] = [', '.join(map(str, x)) for x in data['stopped']]
#print(data['cleaned'])

for i,msg in enumerate(data['cleaned']):
  data['cleaned'].iloc[i] = msg.replace(",","")
  #print(msg)

#data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
X = [line for line in data['cleaned']]
Y = np.array(Y).reshape(-1,1)

#X

In [88]:

x_train, x_test, y_train, y_test =  train_test_split(X, Y, test_size=0.2)
print(x_train)
#print(y_train)
words = [line.split() for line in x_train]
print(words)
#print(len(y_test))
w2vec = gensim.models.Word2Vec(size=200, min_count=1, window=3,workers=8)
w2vec.build_vocab(words)



In [90]:
keys = w2vec.wv.vocab.keys()
print(keys)
vocab_size = len(keys)
print("Vocab size", vocab_size)

w2vec.train(words,total_examples=len(words),epochs=30)

Vocab size 11151


(1183298, 1232370)

In [91]:
w2vec.wv.most_similar("role")

[('people.', 0.9993878602981567),
 ('rather', 0.9993140697479248),
 ('cat', 0.9992601871490479),
 ('rock', 0.99924236536026),
 ('girls', 0.9992402791976929),
 ('damn', 0.9992326498031616),
 ('meant', 0.9992159605026245),
 ('right.', 0.9992154240608215),
 ('sister', 0.999212384223938),
 ('later?\\""', 0.9991198778152466)]

In [94]:
from keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer()

tokenizer.fit_on_texts(x_train)
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

from keras.preprocessing.sequence import pad_sequences
xtrain = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=300)
xtest = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=300)

print(xtrain)

Total words 7897
[[   0    0    0 ...   98  518    4]
 [   0    0    0 ... 3721  817 1130]
 [   0    0    0 ...  818  164 3722]
 ...
 [   0    0    0 ...  788 7896   38]
 [   0    0    0 ...    7    6  321]
 [   0    0    0 ...   28   36  144]]


In [95]:
from sklearn.metrics import accuracy_score
from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(xtrain,y_train)
prediction = model.predict(xtest)
accuracy_score(y_test, prediction)

0.9085201793721973

### Part 2: Content Based Recommender

Build a system that recommends movies that are similar to a particular movie

In [None]:
# Load 'overview' feature in metadata dataset

In [96]:
from google.colab import files
files.upload()

import pandas as pd

movie_data = pd.read_csv("movies_metadata.csv", low_memory=False)
movie_data.shape

(45466, 24)

In [97]:
# Use your judgement to preprocess data 

In [99]:
movie_data = movie_data.dropna()        #dropping all rows where atlease one value is NA....I dropped all rows with NA as I wanted a smaller subset of data because I was getting memmory error
movie_data.shape

(693, 24)

In [100]:
# Construct TF-IDF matrix

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ngram_range=(1, 3),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movie_data['overview'])
tfidf_matrix.shape

(693, 46596)

In [None]:
# Compute cosine similarity score between movies

In [102]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

movie_data = movie_data.reset_index()
print(movie_data)
#indices = movie_data['index'].toint()
titles = movie_data['title']
print(titles)
# finding indices of every title
movies = pd.Series(movie_data.index, index=titles)
print("movies ", movies)


     index  adult  ... vote_average vote_count
0        9  FALSE  ...          6.6     1194.0
1       68  FALSE  ...          7.0      513.0
2       69  FALSE  ...          6.9     1644.0
3      153  FALSE  ...          6.8       28.0
4      178  FALSE  ...          5.2      153.0
..     ...    ...  ...          ...        ...
688  44274  FALSE  ...          6.7     1675.0
689  44674  FALSE  ...          6.0       35.0
690  44821  FALSE  ...          6.0      144.0
691  44842  FALSE  ...          6.2     1440.0
692  45029  FALSE  ...          6.8        5.0

[693 rows x 25 columns]
0                                    GoldenEye
1                                       Friday
2                          From Dusk Till Dawn
3                             Blue in the Face
4      Mighty Morphin Power Rangers: The Movie
                        ...                   
688             War for the Planet of the Apes
689                Goon: Last of the Enforcers
690              Pokémon: Spell of 

In [None]:
# Take movie title as input and output 10 most similar movies

In [103]:
movie = input("Write a movie name to find 10 similar movies : ")

idx = movies[movie]
print("Index ",idx)
sim_scores = list(enumerate(cosine_sim[idx]))

sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
sim_scores = sim_scores[1:11]
movie_indices = [i[0] for i in sim_scores]

print("Recommended Movies")

output = []
count = 1
for item in titles.iloc[movie_indices]:
  output.append(item)
  print(count," -> ",item)
  count += 1

Write a movie name to find 10 similar movies : Friday
Index  1
Recommended Movies
1  ->  Escape from L.A.
2  ->  Cross
3  ->  30 Days of Night: Dark Days
4  ->  Constantine
5  ->  Blade Runner
6  ->  Ouija: Origin of Evil
7  ->  Sharknado
8  ->  Terminator Genisys
9  ->  Klown
10  ->  Before Sunset
