# Natural Language Processing

In [22]:
# importing required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from google.colab import drive
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.ensemble import GradientBoostingClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# mounting google drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# loading sms data
data = pd.read_csv('/content/drive/My Drive/ML TA/Demo 9/spam.csv', encoding='latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [26]:
# checking the distribution of the target variable
data["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
# pre processing text
def text_pre_processing(sms):
    
    # removing punctuations
    sms_wo_punct = [x for x in sms if x not in string.punctuation]
    sms_wo_punct = ''.join(sms_wo_punct)
    
    # keeping only english letters
    sms_wo_punct_only_words = [x for x in sms_wo_punct if x.isalpha() or x == " "]
    sms_wo_punct_only_words = ''.join(sms_wo_punct_only_words)
    
    # converting the data to lower case
    sms_wo_punct_only_words_lc = sms_wo_punct_only_words.lower()
    
    # removing stopwords
    sms_wo_punct_only_words_lc = sms_wo_punct_only_words_lc.split(" ")
    sms_wo_punct_only_words_lc_wo_sw = [x for x in sms_wo_punct_only_words_lc if x not in stopwords.words('english')]
    sms_wo_punct_only_words_lc_wo_sw = ' '.join(sms_wo_punct_only_words_lc_wo_sw)
    
    return sms_wo_punct_only_words_lc_wo_sw

In [14]:
# pre processed sms
data_processed = pd.DataFrame(columns = ["pre_processed_sms"])
for i in tqdm(data["v2"]):
    data_processed = data_processed.append({"pre_processed_sms": text_pre_processing(i)}, ignore_index = True)
data_processed

100%|██████████| 5572/5572 [00:25<00:00, 220.99it/s]


Unnamed: 0,pre_processed_sms
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts st...
3,u dun say early hor u c already say
4,nah dont think goes usf lives around though
...,...
5567,nd time tried contact u u å pound prize clai...
5568,ì b going esplanade fr home
5569,pity mood soany suggestions
5570,guy bitching acted like id interested buying s...


In [16]:
# Splitting into train and test
x_train, x_test, y_train, y_test = train_test_split(data_processed, data["v1"], test_size = 0.20, random_state = 100)
x_train = x_train.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [18]:
# Using word to vec
alg = Word2Vec(size = 500, min_count = 1)
alg.build_vocab(x_train)
alg.train(x_train, total_examples = len(x_train), epochs = alg.epochs)

(5, 85)

In [19]:
# converting words to vector
def convert_word_to_vector(sms, size):
    vec = np.zeros(500).reshape((1, 500))
    count = 0
    for word in sms:
        try:
            vec += alg[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [21]:
# converting training and testing to vectors
x_train_vecs = scale(np.concatenate([convert_word_to_vector(z, 500) for z in x_train["pre_processed_sms"]]))
x_test_vecs = scale(np.concatenate([convert_word_to_vector(z, 500) for z in x_test["pre_processed_sms"]]))

  import sys
  import sys


In [23]:
# training a gradient boosting classifier
clf = GradientBoostingClassifier(n_estimators = 100, max_depth = 3, random_state = 100)
clf.fit(x_train_vecs, y_train)

GradientBoostingClassifier(random_state=100)

In [24]:
# training confusion matrix
y_train_pred = clf.predict(x_train_vecs)
confusion_matrix(y_train,y_train_pred)

array([[3808,   47],
       [ 236,  366]])

In [25]:
# testing confusion matrix
y_test_pred = clf.predict(x_test_vecs)
confusion_matrix(y_test,y_test_pred)

array([[946,  24],
       [ 79,  66]])

# Recommender Systems

In [27]:
# loading the dataset
df = pd.read_csv('/content/drive/My Drive/ML TA/Demo 9/movies_metadata.csv')
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,[],"[{'iso_3166_1': 'IR', 'name': 'Iran'}]",,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [31]:
# checking percentage null for each columns
round(100*(df.isnull().sum()/len(df.index)), 2)

adult                     0.00
belongs_to_collection    90.12
budget                    0.00
genres                    0.00
homepage                 82.88
id                        0.00
imdb_id                   0.04
original_language         0.02
original_title            0.00
overview                  2.10
popularity                0.01
poster_path               0.85
production_companies      0.01
production_countries      0.01
release_date              0.19
revenue                   0.01
runtime                   0.58
spoken_languages          0.01
status                    0.19
tagline                  55.10
title                     0.01
video                     0.01
vote_average              0.01
vote_count                0.01
dtype: float64

In [33]:
# checking the information about each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [35]:
# creating a budget - revenue column to calculate profit
# but as visible above, the budget column is of object type which is needed to be converted to float first
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45463 non-null  float64
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [36]:
df['profit'] = df['budget'] - df['revenue']

In [37]:
# checking the top profit movies
df = df.sort_values('profit', ascending = False)
df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,profit
21175,False,,255000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://disney.go.com/the-lone-ranger/,57201,tt1210819,en,The Lone Ranger,The Texas Rangers chase down a gang of outlaws...,12.7291,/b4vil5ueYJNBNypHmo1tpuevh4z.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2013-07-03,89289910.0,149.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Never Take Off the Mask,The Lone Ranger,False,5.9,2361.0,165710090.0
14823,False,,150000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",http://www.thewolfmanmovie.com/,7978,tt0780653,en,The Wolfman,"Lawrence Talbot, an American man on a visit to...",9.29721,/igiSz8bGHE0BegZ8xvlokAQgAk3.jpg,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'US', 'name': 'United States o...",2010-02-11,0.0,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,When the moon is full the legend comes to life,The Wolfman,False,5.5,562.0,150000000.0
32849,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",150000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",http://www.pokemon-movie.jp/,350499,tt4503906,ja,ポケモン・ザ・ムービーXY 光輪の超魔神 フーパ,"Ash, Pikachu, and their friends come to a dese...",3.176182,/y5DFVX0QTZoVkPB3RFeudufW5fM.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': 'O...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2015-07-18,0.0,73.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,A Power Unbound. A Battle of Legends.,Pokémon the Movie: Hoopa and the Clash of Ages,False,6.2,39.0,150000000.0
43190,False,,125000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",http://www.hbo.com/band-of-brothers,331214,tt0185906,en,Band of Brothers,Drawn from interviews with survivors of Easy C...,7.903731,/yRXTVpDRBA3983C3HjoY0SO4dV6.jpg,"[{'name': 'HBO', 'id': 6068}]","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2001-09-09,0.0,705.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,Ordinary men. Extraordinary times.,Band of Brothers,False,8.2,725.0,125000000.0
27656,False,,120000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.hbo.com/the-pacific/index.html,189197,tt0374463,en,The Pacific,"A 10-part mini-series from the creators of ""Ba...",6.09224,/xV7FKNqOwnO3aJSiRM8WCrwdRS8.jpg,"[{'name': 'Playtone', 'id': 4171}, {'name': 'H...","[{'iso_3166_1': 'US', 'name': 'United States o...",2010-03-15,0.0,540.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Hell was an ocean away,The Pacific,False,7.9,318.0,120000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,- Written by Ørnås,0.065736,,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Midnight Man,False,6.0,1,,,,,,,,,,
29502,False,"{'id': 122661, 'name': 'Mardock Scramble Colle...",0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",http://m-scramble.jp/exhaust/,122662,tt2423504,ja,マルドゥック・スクランブル 排気,Third film of the Mardock Scramble series.,,,,,,,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Mardock Scramble: The Third Exhaust,False,7.0,12,,,,,,,,,,
35586,False,,0.0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 28,...",,249260,tt2622826,en,Avalanche Sharks,A group of skiers are terrorized during spring...,,,,,,,,,,,,,,,
