In [None]:
import pandas as pd
import numpy as np
import requests

In [None]:
# Location of the movies CSV inside the notebook runtime.
movies_csv_path = '/content/movies.csv'
df = pd.read_csv(movies_csv_path)

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

(9980, 4)

In [None]:
# Peek at the first rows to see which columns the CSV provides.
df.head()

Unnamed: 0.1,Unnamed: 0,title,overview,genres
0,0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']"
2,2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Drama', 'Crime']"
3,3,Schindler's List,The true story of how businessman Oskar Schind...,"['Drama', 'History', 'War']"
4,4,12 Angry Men,The defense and the prosecution have rested an...,['Drama']


# Removing Extra Column #

In [None]:
# Drop the 'Unnamed: 0' column, which only repeats the row number.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather
# than a KeyError for the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Confirm that only title, overview and genres remain.
df.head()

Unnamed: 0,title,overview,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Drama', 'Crime']"
3,Schindler's List,The true story of how businessman Oskar Schind...,"['Drama', 'History', 'War']"
4,12 Angry Men,The defense and the prosecution have rested an...,['Drama']


# Performing NLP Steps

## Step 1 : Lower Casing

In [None]:
# Normalise case so later matching does not depend on capitalisation.
for text_col in ('title', 'overview', 'genres'):
    df[text_col] = df[text_col].str.lower()

In [None]:
# Spot-check random rows; the fixed seed makes the displayed sample
# reproducible across kernel restarts.
df.sample(10, random_state=42)

Unnamed: 0,title,overview,genres
9071,three steps over heaven,a bad boy and a socialite girl hook-up.,"['drama', 'romance']"
2476,crash,"in post-sept. 11 los angeles, tensions erupt w...",['drama']
5445,underworld: evolution,as the war between the vampires and the lycans...,"['fantasy', 'action', 'thriller']"
2786,santa sangre,a former circus artist escapes from a mental h...,"['thriller', 'drama', 'horror']"
693,train to busan,when a zombie virus pushes korea into a state ...,"['horror', 'thriller', 'action', 'adventure']"
8664,rv,climbing aboard their mammoth recreational veh...,"['family', 'comedy', 'romance', 'drama']"
5379,damage,the life of a respected british politician at ...,"['drama', 'romance']"
4504,in the valley of elah,a career officer and his wife work with a poli...,"['history', 'drama', 'thriller', 'crime', 'mys..."
8472,"new york, i love you","new york, i love you delves into the intimate ...","['comedy', 'drama', 'romance']"
3281,creed ii,between personal obligations and training for ...,"['drama', 'action']"


## Step 2 : Check for Unnecessary Data & Remove It ##

In [None]:
import re

In [None]:
# Flag overviews containing anything shaped like an HTML tag, then count them.
has_html_tag = df['overview'].str.contains(r"<.*?>", regex=True)
has_html_tag.sum()

0

In [None]:
# Flag overviews containing an http:// or https:// prefix, then count them.
has_url = df['overview'].str.contains(r"http[s]?://", regex=True)
has_url.sum()

0

In [None]:
## The overviews contain no HTML tags or URLs, so nothing needs to be removed here

## Step 3 : Remove Punctuations ( ! , ? , \ )

In [None]:
# Delete every character that is neither a word character nor whitespace.
# Characters are removed, not replaced by a space, so the pieces on either
# side of a hyphen or apostrophe end up joined together.
non_word_chars = r'[^\w\s]'
for text_col in ('title', 'overview', 'genres'):
    df[text_col] = df[text_col].str.replace(non_word_chars, '', regex=True)

In [None]:
df.head()  # Punctuation has been stripped from title, overview and genres

Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,drama crime
1,the godfather,spanning the years 1945 to 1955 a chronicle of...,drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime
3,schindlers list,the true story of how businessman oskar schind...,drama history war
4,12 angry men,the defense and the prosecution have rested an...,drama


## Step 4 : Spelling Correction

In [None]:
from spellchecker import SpellChecker


In [None]:
# Shared spell-checker instance used by correct_sentence.
spell = SpellChecker()

In [None]:
def correct_sentence(text, checker=None):
    """Return ``text`` with words unknown to the spell checker corrected.

    The text is split on whitespace and each word is handled on its own:

    * words the checker already knows are kept unchanged;
    * tokens that are not purely alphabetic (years, "1940s", ...) are kept
      unchanged, since a word dictionary has nothing useful to say about them;
    * every other word is replaced by the checker's suggestion, falling back
      to the original word when the checker has no suggestion.

    Note that any alphabetic word missing from the checker's dictionary is
    treated as a misspelling, which includes names the dictionary lacks.

    Parameters
    ----------
    text : str or any
        Sentence to correct. Non-string values (e.g. NaN) are returned as-is.
    checker : object, optional
        Object supporting ``word in checker`` and ``checker.correction(word)``.
        Defaults to the module-level ``spell`` instance.

    Returns
    -------
    str or any
        The corrected words joined by single spaces, or ``text`` itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    if checker is None:
        checker = spell

    fixed_words = []
    for word in text.split():
        # Only unknown, purely alphabetic words are candidates for correction.
        if not word.isalpha() or word in checker:
            fixed_words.append(word)
            continue
        suggestion = checker.correction(word)
        # correction() may yield None when no candidate exists; keep the word
        # so that the join below never receives a non-string.
        fixed_words.append(suggestion if suggestion else word)
    return " ".join(fixed_words)


In [None]:
# Run the spelling pass over every overview.
overview_spellchecked = df['overview'].apply(correct_sentence)
df['overview'] = overview_spellchecked

In [None]:
df.head()  # Preview of the data after the spelling-correction pass

Unnamed: 0,title,overview,genres
0,the shawshank redemption,imprisoned in the 1940s for the double murder ...,drama crime
1,the godfather,spanning the years 1945 to 1955 a chronicle of...,drama crime
2,the godfather part ii,in the continuing saga of the corleone crime f...,drama crime
3,schindlers list,the true story of how businessman oskar schind...,drama history war
4,12 angry men,the defense and the prosecution have rested an...,drama


## Step 5 : Removing Stop Words ( a , and , of )

In [None]:
import nltk
# Fetch the NLTK stop-word corpus needed by nltk.corpus.stopwords.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords


In [None]:
# English stop words held in a set for fast membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Return ``text`` without the words listed in ``stop_words``.

    The text is split on whitespace, each word is compared in lower case
    against ``stop_words``, and the surviving words are re-joined with single
    spaces. Non-string values (such as NaN) are handed back untouched.
    """
    if isinstance(text, str):
        kept = (word for word in text.split() if word.lower() not in stop_words)
        return " ".join(kept)
    return text


In [None]:
# Strip stop words from every overview.
overview_without_stopwords = df['overview'].apply(remove_stopwords)
df['overview'] = overview_without_stopwords

In [None]:
df['overview']  # Overviews after stop-word removal

Unnamed: 0,overview
0,imprisoned 1940s double murder wife lover upst...
1,spanning years 1945 1955 chronicle fictional i...
2,continuing saga corleone crime family young vi...
3,true story businessman oskar schindler saved t...
4,defense prosecution rested jury filing jury ro...
...,...
9975,small midwestern town prepares annual christma...
9976,one thousand years cataclysmic events forced h...
9977,ongoing war canine feline species put hold joi...
9978,cia agent interned failing kill international ...


## Step 6 : Tokenization

In [None]:
import nltk
# Tokenizer resources fetched ahead of the word_tokenize calls in this step.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize


In [None]:
def tokenize_text(text):
    """Split ``text`` into a list of tokens using ``word_tokenize``.

    Anything that is not a string (e.g. NaN) yields an empty list, so every
    row ends up holding a list.
    """
    if isinstance(text, str):
        return word_tokenize(text)
    return []

In [None]:
# Turn the cleaned overview and genres strings into token lists.
for text_col in ('overview', 'genres'):
    df[text_col] = df[text_col].apply(tokenize_text)

In [None]:
# Spot-check random rows; the fixed seed makes the displayed sample
# reproducible across kernel restarts.
df.sample(5, random_state=42)

Unnamed: 0,title,overview,genres
7662,the stranger,"[two, strangers, strike, conversation, long, j...","[thriller, crime, drama]"
3285,a fantastic woman,"[marinas, life, thrown, turmoil, following, de...",[drama]
1696,irma la douce,"[naive, policeman, falls, love, prostitute, do...","[romance, comedy]"
764,manhattan,"[manhattan, explores, life, middleaged, televi...","[comedy, drama, romance]"
2964,sapphire blue,"[gwen, discovered, shes, final, member, secret...","[fantasy, romance, drama]"


## Step 7 : Lemmatization

In [None]:
import nltk
# WordNet resources fetched ahead of creating the WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Shared lemmatizer instance used by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_tokens(tokens, pos='n'):
    """Lemmatize every token in ``tokens`` with the shared ``lemmatizer``.

    Parameters
    ----------
    tokens : iterable of str, None or float
        Tokens to lemmatize. ``None`` and float values (NaN for a missing
        row) produce an empty list, mirroring how ``tokenize_text`` treats
        missing input.
    pos : str, optional
        Part-of-speech tag forwarded to ``lemmatizer.lemmatize``. The default
        ``'n'`` is the same tag the lemmatizer uses when none is given, so
        existing calls behave as before; pass e.g. ``'v'`` for verbs.

    Returns
    -------
    list of str
        One lemma per input token, in the original order.
    """
    # A missing row is not iterable; treat it as "no tokens".
    if tokens is None or isinstance(tokens, float):
        return []
    return [lemmatizer.lemmatize(token, pos=pos) for token in tokens]

In [None]:
# Lemmatize each overview's token list.
overview_lemmas = df['overview'].apply(lemmatize_tokens)
df['overview'] = overview_lemmas

In [None]:
# Lemmatize each row's genre tokens.
genre_lemmas = df['genres'].apply(lemmatize_tokens)
df['genres'] = genre_lemmas

In [None]:
# Final preview: overview and genres now hold lists of lemmatized tokens.
df.head()

Unnamed: 0,title,overview,genres
0,the shawshank redemption,"[imprisoned, 1940s, double, murder, wife, love...","[drama, crime]"
1,the godfather,"[spanning, year, 1945, 1955, chronicle, fictio...","[drama, crime]"
2,the godfather part ii,"[continuing, saga, corleone, crime, family, yo...","[drama, crime]"
3,schindlers list,"[true, story, businessman, oskar, schindler, s...","[drama, history, war]"
4,12 angry men,"[defense, prosecution, rested, jury, filing, j...",[drama]
