# Text Classification with IMDB Movie Descriptions

<center><img src="./img/homepage.png" style="max-height: 400px; max-width: auto;"/></center>

## IMDB Web Scraping

For more information please open 

## Data Preprocessing

### Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import csv, re, random

In [2]:
corpus_file = 'movie_corpus.csv'

In [3]:
def value_reduced(x):
    """Normalize an empty field value to ``None``.

    Returns ``x`` unchanged unless it is ``None`` or its string form is
    empty, in which case ``None`` is returned.
    """
    # Identity check: `None != x` would defer to x's own __ne__ implementation.
    if x is None or not str(x):
        return None
    return x

In [4]:
# Load the pipe-delimited corpus, keeping only rows that have both a genre
# and a description.
dataset = []
# newline='' leaves line-ending handling to the csv module. The encoding is
# stated explicitly so non-ASCII titles do not depend on the platform default
# (assumes the corpus file is UTF-8 — TODO confirm).
with open(corpus_file, encoding='utf-8', newline='') as f:
    movie_corpus_reader = csv.DictReader(f, delimiter='|')
    fields = movie_corpus_reader.fieldnames

    for movie in movie_corpus_reader:
        # A missing value (None) and an empty string are both falsy.
        if not movie['genre'] or not movie['description']:
            continue

        # Keep the year only when it is a four-character numeric string.
        year = str(movie['year'])
        movie['year'] = year if year.isnumeric() and 4 == len(year) else None

        # Drop the metascore field and turn remaining empty values into None.
        dataset.append(
            {k: value_reduced(v) for k, v in movie.items() if 'metascore' != k}
        )
        

### Split dataset

In [5]:
# Fixed seed, passed as `random_state` to the train/test split below so the
# split is the same on every run. To draw a fresh seed, use the line below.
# NOTE(review): that formula can produce values up to 1e10, which is larger
# than 2**32 - 1 — confirm the range the splitter accepts before enabling it.
# random_seed = int(random.random() * 1e+10)
random_seed = 2958053999

In [6]:
# Shuffle and split the corpus; the fixed seed keeps the split reproducible.
train_dataset, test_dataset = train_test_split(
    dataset, shuffle=True, random_state=random_seed
)

# Title and genre of every movie, collected separately for each split.
stratisfy_train = {
    'title': [m['title'] for m in train_dataset],
    'genre': [m['genre'] for m in train_dataset],
}
stratisfy_test = {
    'title': [m['title'] for m in test_dataset],
    'genre': [m['genre'] for m in test_dataset],
}

In [7]:
train_dataset[2]

{'title': 'Snabba cash II',
 'year': '2012',
 'movie_rate': None,
 'runtime': '99',
 'genre': 'Action, Crime, Drama',
 'rating': '6.3',
 'description': 'Three years later, JW gets out of prison, but soon finds himself between the contending parties of his criminal past.',
 'directors': 'Babak Najafi+Bruce Axl Argeadson',
 'starts': 'Joel Kinnaman+Matias Varela+Dragomir Mrsic+Fares Fares'}

In [8]:
# X_* frames are built from the full movie records, y_* frames from the
# title/genre dictionaries.
X_train, X_test = pd.DataFrame(train_dataset), pd.DataFrame(test_dataset)
y_train, y_test = pd.DataFrame(stratisfy_train), pd.DataFrame(stratisfy_test)

In [None]:
X_train.head()

In [None]:
# NOTE(review): `movie_corpus` is not defined in any cell shown above —
# presumably one of the DataFrames built earlier was meant; confirm before running.
movie_corpus[['genre']]

### Extract movie's genre

In [9]:
def movie_genre_extraction(corpus):
    """Return the sorted list of distinct genre names found in ``corpus``.

    Args:
        corpus: DataFrame with a ``genre`` column holding comma-separated
            genre names (e.g. ``'Action, Crime, Drama'``). Missing values
            are ignored.

    Returns:
        Alphabetically sorted list of unique, non-empty genre names.
    """
    # Split on the bare comma and strip each piece, so entries written
    # without a space after the comma are still separated correctly.
    unique_genres = {
        name.strip()
        for genres in corpus['genre'].dropna()
        for name in genres.split(',')
    }
    return sorted(name for name in unique_genres if name)

def update_for_genre(genre, corpus):
    """Add one 0/1 indicator column per genre to ``corpus``.

    Args:
        genre: A genre name, or an iterable of genre names, to encode.
        corpus: DataFrame with a ``genre`` column of comma-separated genre
            names. It is modified in place.

    Returns:
        The same ``corpus`` object, with a new column for every requested
        genre: 1 where the movie lists that genre, 0 otherwise (including
        rows whose genre is missing or not a string).
    """
    genre_names = [genre] if isinstance(genre, str) else list(genre)

    # Parse each row once into a set of exact genre names; matching on the
    # raw string would report 'Music' for a movie tagged 'Musical'.
    row_genres = [
        {part.strip() for part in value.split(',')} if isinstance(value, str) else set()
        for value in corpus['genre']
    ]

    for name in genre_names:
        corpus[name] = [int(name in parts) for parts in row_genres]

    return corpus

def text_to_float(g):
    """Convert a gross value to a float.

    Args:
        g: Either a number, or a string such as ``'$12.5M'``; the ``$`` sign
            and the letter ``m`` (any case) are removed before parsing.

    Returns:
        The value as a ``float``.

    Raises:
        ValueError: If the cleaned string is not a valid number.
    """
    # isinstance also accepts ints and float subclasses (e.g. numpy.float64),
    # which an exact type-name comparison would send down the string path.
    if isinstance(g, (int, float)):
        return float(g)

    return float(g.lower().replace('$', '').replace('m', ''))

def covert_gross(corpus):
    """Return the non-null ``gross`` values of ``corpus`` converted with ``text_to_float``."""
    gross = corpus['gross']
    present = gross[gross.notnull()]
    return present.apply(text_to_float)

### Convert movie genre to one-hot encoding

In [10]:
train_genre = movie_genre_extraction(y_train)

# Parse each row once into a set of exact genre names. Testing membership on
# the raw string would be a substring match, so 'Music' would also be set for
# a movie tagged only 'Musical'.
genre_tokens = [
    {part.strip() for part in genres.split(',')} if isinstance(genres, str) else set()
    for genres in y_train['genre']
]

# One 0/1 column per genre.
for genre in train_genre:
    y_train[genre] = [int(genre in tokens) for tokens in genre_tokens]

# The raw genre string is no longer needed once it has been encoded.
del y_train['genre']
y_train.head()

Unnamed: 0,title,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,Padre no hay más que uno,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Marvin ou la belle éducation,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Snabba cash II,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Save Yourself,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,Anoko wa kizoku,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Split out title and description

In [12]:
# Select only the title and description columns of the training features.
description_train = X_train[['title', 'description']]

## Text Analytics

In [None]:
import nltk
from nltk.corpus import stopwords

### Download stopwords (Required at first run)

In [None]:
import ssl

In [None]:
# Replace ssl's default HTTPS context factory with the unverified one before
# downloading NLTK data.
# NOTE(review): this switches off certificate verification for everything in
# this process that uses ssl's default HTTPS context, and it relies on
# underscore-prefixed (private) ssl attributes — confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Called with no arguments; the notes after this cell describe the download
# dialog that appears.
nltk.download()

The dialog below is shown after the cell above is executed. All downloaded resources are stored in `~/nltk_data`.

<center><img src="./img/nltk-package.png" /></center>

<center><img src="./img/nltk-package-downloaded.png" /></center>

In [None]:
print(stopwords.words('english'))

### NLP Example

Objective: We need to tokenize the movie descriptions we collected from IMDB. The tasks are:

- [ ] Remove all special characters (such as `,`, `.` or `;`) and numbers from the description
- [ ] Tokenize the movie description
- [ ] Count each token's frequency
- [ ] Convert to lowercase
- [ ] Remove stop words

### Experiment process

#### Tokenize

In [None]:
from nltk import word_tokenize
from collections import Counter

In [None]:
# sentence = re.sub('[^A-Za-z ]+', '', movie_corpus.description[0].lower())
sentence = movie_corpus.description[0].lower()

In [None]:
# Example: lower-case the first description and split it into tokens.
# NOTE(review): `movie_corpus` is not defined in the visible cells — confirm
# where it comes from before running.
movie_corpus.description[0]  # bare expression; has no effect here (not the cell's last line)
sentence = movie_corpus.description[0].lower()

tokenized = word_tokenize(sentence)
tokenized

### Remove all stopwords

In [None]:
english_stopwords = frozenset(stopwords.words('english'))

In [None]:
tokenized_set = set(tokenized)

remaining = tokenized_set - english_stopwords
remaining

In [None]:
# Count occurrences over the token list, not the de-duplicated set: counting
# a set yields 1 for every token, which is not a term frequency.
tf = Counter(token for token in tokenized if token not in english_stopwords)
tf

In [None]:
class classification(object):
    """Callable wrapper around a function.

    Calling an instance calls the wrapped function ``f`` with the same
    arguments and returns its result.
    """

    def __init__(self, f):
        # The wrapped callable.
        self.f = f

    def __call__(self, *args, **kwargs):
        # Forward arguments and hand back the result so the wrapper can stand
        # in for functions that take parameters or return a value.
        return self.f(*args, **kwargs)

In [None]:
def text_tokenize(description) -> str:
    """Lower-case ``description`` and delete every run of characters other than ASCII letters and spaces."""
    import re

    lowered = description.lower()
    cleaned = re.sub('[^A-Za-z ]+', '', lowered)
    return cleaned

def counting(description, stopwords=None):
    if None == stopwords:
        from nltk import stopwords
        stopwords = frozenset(stopwords.words('english'))