In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the labelled training set. The file has no header row and its fields
# are delimited by ':::'; a multi-character separator needs the python engine.
train_data = pd.read_csv(
    '/content/train_data.txt',
    engine='python',
    sep=':::',
    header=None,
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)

# Discard incomplete rows, then switch to the column labels the later cells use
train_data.dropna(inplace=True)
train_data.columns = ['ID', 'Title', 'Gerne', 'Description']
train_data.head()

Unnamed: 0,ID,Title,Gerne,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [None]:
# Read the unlabelled test set: same ':::'-delimited, header-less layout as
# the training file, but without a genre field.
test_data = pd.read_csv(
    '/content/test_data.txt',
    engine='python',
    sep=':::',
    header=None,
    names=['ID', 'TITLE', 'DESCRIPTION'],
)

# Discard incomplete rows, then switch to the column labels the later cells use
test_data.dropna(inplace=True)
test_data.columns = ['ID', 'Title', 'Description']
test_data.head()

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [None]:
# Read the test-set solution file (test rows together with their true genre).
# Same ':::'-delimited, header-less layout as the training file.
test_solution = pd.read_csv(
    '/content/test_data_solution.txt',
    engine='python',
    sep=':::',
    header=None,
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)

# Discard incomplete rows, then relabel the columns to match train_data
test_solution.dropna(inplace=True)
test_solution.columns = ['ID', 'Title', 'Gerne', 'Description']
test_solution.head()

Unnamed: 0,ID,Title,Gerne,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [None]:
# Build TF-IDF features without leaking test-set statistics.
# The vocabulary and IDF weights are learned from the training descriptions
# only; the test descriptions are then projected into that same feature space.
# (Fitting on train + test combined would let held-out text influence the
# features the model is trained on.)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=30000)  # You can adjust max_features as needed

# Fit on, and transform, the training descriptions
train_tfidf = tfidf_vectorizer.fit_transform(train_data['Description'])

# Transform the test descriptions with the already-fitted vectorizer (no refit)
test_tfidf = tfidf_vectorizer.transform(test_data['Description'])

# Convert genre labels to numerical values.
# Ids are assigned in the order the genres first appear in the training data.
genre_mapping = {genre: idx for idx, genre in enumerate(train_data['Gerne'].unique())}
train_data['GENRE_NUM'] = train_data['Gerne'].map(genre_mapping)


In [None]:
# Fit a logistic-regression genre classifier on the TF-IDF training features,
# using the integer genre ids in train_data['GENRE_NUM'] as targets.
# max_iter is set high to give the solver room to converge.
lr_classifier = LogisticRegression(random_state=42, max_iter=10000)
lr_classifier.fit(X=train_tfidf, y=train_data['GENRE_NUM'])

In [None]:
# Run the trained classifier over the test-set TF-IDF rows.
# The result holds one integer genre id per row (ids as defined by genre_mapping).
test_predictions = lr_classifier.predict(X=test_tfidf)

In [None]:
# Convert numerical predictions back to genre labels (inverse of genre_mapping)
predicted_genres = {num: genre for genre, num in genre_mapping.items()}

# Predictions are attached to test_solution by position, so both must have the
# same number of rows. test_data and test_solution were each filtered with
# dropna() independently, so fail loudly if they ended up with different lengths.
if len(test_predictions) != len(test_solution):
    raise ValueError(
        f"Got {len(test_predictions)} predictions for {len(test_solution)} solution rows"
    )

# Evaluate accuracy using test_solution data
test_solution['PREDICTED_GERNE'] = [predicted_genres[prediction] for prediction in test_predictions]
# The true-label column was renamed to 'Gerne' when test_solution was loaded;
# reading it as 'Genre' raises KeyError.
accuracy = accuracy_score(test_solution['Gerne'], test_solution['PREDICTED_GERNE'])

print(f"Accuracy: {accuracy*100:.2f}")


In [None]:
# Predict the genre of a movie description typed in by the user
user_input = input("Enter a movie description: ")
# transform() (not fit_transform) so the text is encoded with the vocabulary
# the classifier was trained on
user_tfidf = tfidf_vectorizer.transform([user_input])
# predict() returns an array with one id per input row; take the single result
user_prediction = lr_classifier.predict(user_tfidf)[0]
# Genre labels were parsed with a bare ':::' separator and still carry the
# surrounding padding spaces, so trim them before display.
user_predicted_genre = predicted_genres[user_prediction].strip()

print(f"Predicted Genre: {user_predicted_genre}")

Enter a movie description: cupid
Predicted Genre:  documentary 


In [None]:
# Predict the genre of another user-supplied movie description
# (same steps as the previous cell, run again for a new input)
user_input = input("Enter a movie description: ")
# Encode the text with the already-fitted vectorizer; do not refit here
user_tfidf = tfidf_vectorizer.transform([user_input])
# One input row in, one predicted genre id out
user_prediction = lr_classifier.predict(user_tfidf)[0]
# Labels keep the padding spaces that surrounded ':::' in the data file;
# strip them so the printed genre has no stray whitespace.
user_predicted_genre = predicted_genres[user_prediction].strip()

print(f"Predicted Genre: {user_predicted_genre}")

KeyboardInterrupt: ignored