In [1]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Load the movie-plots CSV into a DataFrame.
# Raw string literal: "\c" and "\w" are not valid escape sequences, so a plain
# string here triggers an invalid-escape warning on recent Python versions.
# NOTE(review): absolute local path; consider making the location configurable.
data = pd.read_csv(r"C:\codsoft\wiki_movie_plots_deduped.csv")

In [3]:
# Show the loaded DataFrame as this cell's output.
data

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34881,2014,The Water Diviner,Turkish,Director: Russell Crowe,Director: Russell Crowe\r\nCast: Russell Crowe...,unknown,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [4]:
# NOTE(review): no matching start_time is set in the cells shown before this
# one, and later cells reassign end_time before reading it, so this looks like
# a leftover statement. Confirm and remove if unused.
end_time = time.time()

In [6]:
# Draw a fixed-size random sample of rows; the seed keeps it repeatable.
subset_size = 10000  # Can change Subset size
# Clamp to the number of available rows: DataFrame.sample raises ValueError
# when n is larger than the population and replace is False.
subset_data = data.sample(n=min(subset_size, len(data)), random_state=42)

In [7]:
# Inputs come from the 'Plot' column, targets from the 'Genre' column.
X, y = subset_data['Plot'], subset_data['Genre']

In [8]:
print("Splitting data into train and test sets...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock and may be coarse or adjusted.
start_time = time.perf_counter()
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
end_time = time.perf_counter()
print("Data split in {:.2f} seconds".format(end_time - start_time))

Splitting data into train and test sets...
Data split in 0.00 seconds


In [9]:
print("Creating TF-IDF vectorizer...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock and may be coarse or adjusted.
start_time = time.perf_counter()
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # can change max_features
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
end_time = time.perf_counter()
print("TF-IDF vectorization done in {:.2f} seconds".format(end_time - start_time))

Creating TF-IDF vectorizer...
TF-IDF vectorization done in 3.07 seconds


In [10]:
print("Training Logistic Regression model...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock and may be coarse or adjusted.
start_time = time.perf_counter()
logreg_model = LogisticRegression(max_iter=1000, n_jobs=-1)  # Can change parameters
# Train on the TF-IDF features of the training split.
logreg_model.fit(X_train_tfidf, y_train)
end_time = time.perf_counter()
print("Logistic Regression training done in {:.2f} seconds".format(end_time - start_time))

Training Logistic Regression model...
Logistic Regression training done in 259.95 seconds


In [11]:
print("Predicting genres...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is wall-clock and may be coarse or adjusted.
start_time = time.perf_counter()
# Predict a genre label for every row of the held-out test features.
y_pred = logreg_model.predict(X_test_tfidf)
end_time = time.perf_counter()
print("Prediction done in {:.2f} seconds".format(end_time - start_time))

Predicting genres...
Prediction done in 0.18 seconds


In [12]:
print("Evaluating model...")
# NOTE(review): this timer is started here but not stopped in this cell; the
# matching end_time is taken in a different cell, so the reported duration
# includes any delay between running the two cells.
start_time = time.time()
# Build the text report comparing true and predicted labels.
# zero_division=1 presumably explains why labels that are never predicted
# show precision 1.00 in the printed report; verify against sklearn docs.
report = classification_report(y_test, y_pred, zero_division=1)


Evaluating model...


In [13]:

print(report)
# NOTE(review): start_time is not set in this cell; it holds whatever value was
# assigned last before this cell ran, so the elapsed time below also covers the
# print above and any gap between cell executions.
end_time = time.time()
print("Evaluation done in {:.2f} seconds".format(end_time - start_time))

                                                                     precision    recall  f1-score   support

                                                          usa, can        1.00      0.00      0.00         1
                                                              [144]       1.00      0.00      0.00         1
                                                             action       0.33      0.02      0.03        64
                                                 action / adventure       1.00      0.00      0.00         1
                                        action / adventure / comedy       1.00      0.00      0.00         1
                                                     action / drama       1.00      0.00      0.00         2
                                             action / drama / crime       1.00      0.00      0.00         1
                                                   action / fantasy       1.00      0.00      0.00         1
                  