In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree
from sklearn.metrics import accuracy_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import nltk
nltk.download('stopwords')

In [None]:
# Load the IMDb "most voted feature films" CSV, decoding it as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "../input/imdb-10000-most-voted-feature-films-041118/movies.csv",
    encoding="ISO-8859-1",
)
df

In [None]:
# Reduce multi-genre entries to the first listed genre ("Crime,Drama" -> "Crime").
# The vectorised .str accessor replaces the element-by-element Python loop;
# a missing Genre value stays NaN instead of raising AttributeError on
# float.split.
df["Genre"] = df["Genre"].str.split(",").str[0]
df

In [None]:
# Keep only genres with at least MIN_MOVIES_PER_GENRE rows; rarer genres are dropped.
MIN_MOVIES_PER_GENRE = 200

# Rows per genre. (The previous per-genre `.count()[0]` counted non-null
# values of the first column through positional Series indexing, which pandas
# has deprecated; the two agree whenever that column has no gaps.)
genre_counts = df["Genre"].value_counts()

# One boolean mask replaces the in-place drop loop. Rows whose Genre is NaN
# map to NaN, compare False and are dropped as well. .copy() yields a
# standalone frame rather than a slice of the original.
df = df[df["Genre"].map(genre_counts) >= MIN_MOVIES_PER_GENRE].copy()

# Surviving genres in order of first appearance, with their row counts.
g = list(df["Genre"].unique())
for x in g:
    print(x, genre_counts[x])

In [None]:
# Give every surviving genre an integer label, numbered from 1 in the order of g.
genre = {name: code for code, name in enumerate(g, start=1)}
genre

In [None]:
# Keep only the label and the text, then replace genre names by their numeric codes.
# .copy() makes df an independent frame, so the column assignment below is
# not a chained assignment on a slice of the wider table.
df = df[["Genre", "Description"]].copy()
df["Genre"] = df["Genre"].map(genre)
# .map yields NaN for anything missing from `genre` (for example when this
# cell is re-run on an already encoded column), so fail loudly here.
assert df["Genre"].notna().all(), "Genre column contains values missing from `genre`"
df

In [None]:
def pre_process(text):
    """Normalise a description for vectorisation.

    Strips ASCII punctuation, removes English stop words (compared
    case-insensitively) and stems every remaining word with the English
    Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        The stemmed words, each followed by a single space (so a non-empty
        result ends with a trailing space; an empty input gives "").
    """
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Load the stop-word list and build the stemmer once per call; previously
    # the list was re-read for every word and a stemmer rebuilt for every
    # kept word. A set also makes each membership test constant-time.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")
    kept = [word for word in text.split() if word.lower() not in stop_words]
    return "".join(stemmer.stem(word) + " " for word in kept)

In [None]:
# Clean every description. apply() returns a new Series, so df itself is not modified.
textFeatures = df["Description"].apply(pre_process)

# Optional: store the cleaned text so the pre-processing can be skipped later.
# pd.DataFrame(textFeatures).to_csv("vectorizer_input.csv")

In [None]:
# Turn the cleaned descriptions into TF-IDF weighted term vectors.
# The previous call passed "english" positionally, which binds to the
# `input` parameter (not `stop_words`) and is rejected by scikit-learn
# releases where these arguments are keyword-only. Stop words are already
# removed in pre_process, so none are configured here; on releases that
# accepted the old call the features should be unchanged.
vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
features = vectorizer.fit_transform(textFeatures)

# features is a SciPy sparse matrix. from_spmatrix builds a documents x terms
# table from it; pd.DataFrame(features) does not expand the term columns.
mydf = pd.DataFrame.sparse.from_spmatrix(features)
print(mydf.head())

In [None]:
# Oversample the minority genres with SMOTE so that every class has the same number of rows.
# NOTE(review): this resamples the full dataset; any train/test split taken
# from X, y will contain synthetic rows on both sides of the split - confirm
# that this is intended before trusting an accuracy measured on it.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=101)
X, y = smote.fit_resample(features, df["Genre"])

In [None]:
# Show how many rows each numeric genre label has after resampling.
print("Genre no: \t count")
for genre_no in set(y):
    n_rows = np.count_nonzero(y == genre_no)
    print(str(genre_no) + ":    \t   ", n_rows)

In [None]:
# Split BEFORE oversampling. X and y were balanced with SMOTE on the whole
# dataset, so splitting them would place synthetic rows interpolated from
# test-set neighbours into the training data and inflate the reported score.
# Here the original rows are split first and only the training part is
# oversampled; the test set keeps real rows in their natural proportions.
raw_train, features_test, raw_labels_train, labels_test = train_test_split(
    features, df['Genre'], test_size=0.2, random_state=111
)
features_train, labels_train = SMOTE(random_state=101).fit_resample(raw_train, raw_labels_train)

In [None]:
# Fit a linear-kernel SVM on the training split and report accuracy on the test split.
# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels, so the former
# gamma=22 had no effect on this linear model and has been removed.
svc = SVC(kernel='linear')
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
print(accuracy_score(labels_test, prediction))

# Optional: persist the fitted model (requires `import pickle`). predict_genre
# also depends on the fitted vectorizer and the genre mapping, so those would
# have to be saved alongside it.
# with open('finalized_model.sav', 'wb') as model_file:
#     pickle.dump(svc, model_file)

In [None]:
def predict_genre(msg):
    """Predict the genre name for a plot description.

    Inputs shorter than 100 characters are not classified; a notice string is
    returned instead. Otherwise the text is cleaned with pre_process,
    vectorised with the already fitted ``vectorizer`` and classified by
    ``svc``; the numeric label is translated back to its name through the
    ``genre`` mapping.
    """
    if len(msg) < 100:
        return " Kindly insert more than 100 characters "

    cleaned = pre_process(msg)
    # transform (not fit_transform) reuses the fitted vectorizer, so the
    # input gets the same feature columns the classifier was trained on
    label = svc.predict(vectorizer.transform([cleaned]))[0]

    # reverse lookup: position of the label among the codes -> genre name
    names = list(genre.keys())
    codes = list(genre.values())
    return names[codes.index(label)]

In [None]:
# Made-up synopsis (no reference genre given)
msg = (
    "a stranger travels into the land of undead where he is not welcomed "
    "but he must survive with whatever means taken"
)
predict_genre(msg)

In [None]:
# Tenet - tagged Action / Sci-fi
msg = (
    "Armed with only one word, Tenet, and fighting for the survival of the "
    "entire world, a Protagonist journeys through a twilight world of "
    "international espionage on a mission that will unfold in something "
    "beyond real time. "
)
predict_genre(msg)

In [None]:
# Mulan (2020) - tagged Action / Adventure
msg = (
    "A girl disguises as a male warrior and joins the imperial army in order "
    "to prevent her sick father from being forced to enlist as he has no "
    "male heir."
)
predict_genre(msg)

In [None]:
# Monster Hunter - tagged Action / Fantasy
msg = (
    "When Lt. Artemis and her loyal soldiers are transported to a new world, "
    "they engage in a desperate battle for survival against enormous enemies "
    "with incredible powers. Feature film based on the video game by Capcom."
)
predict_genre(msg)

In [None]:
# The Hole in the Ground (2019) - tagged Horror / Thriller
msg = (
    "One night, Sarah's young son disappears into the woods behind their "
    "rural home. When he returns, he looks the same, but his behavior grows "
    "increasingly disturbing. Sarah begins to believe that the boy who "
    "returned may not be her son at all."
)
predict_genre(msg)

In [None]:
# Emma - tagged Romance / Drama
msg = (
    "Following the antics of a young woman, Emma Woodhouse, who lives in "
    "Georgian- and Regency-era England and occupies herself with "
    "matchmaking - in sometimes misguided, often meddlesome fashion- in the "
    "lives of her friends and family."
)
predict_genre(msg)