In [1]:
import numpy as np
import pandas as pd

In [2]:
metadata = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv")
train_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv")

In [3]:
train_data.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE
1,tony_montana_frodo_baggins_v_rocky_balboa,Felicia Lopez,False,With a cast that reads like the Vogue Oscar pa...,NEGATIVE
2,darth_vader_katniss_everdeen_sorcerer_donnie_d...,Mr. Charles Burgess,True,Creed II does not give us anything but another...,POSITIVE
3,lara_croft_glimmer,Ryan Barrett,False,"I know what you're thinking, but this is no Li...",POSITIVE
4,jason_bourne_surreal_the_terminator_indiana_jones,Alexander Glover,False,Director Fernando Meirelles tells the story wi...,POSITIVE


In [4]:
train_data = train_data.drop_duplicates(subset=["movieid", "reviewerName", "reviewText"])

In [5]:
merged_df = pd.merge(metadata, train_data, on="movieid", how="inner")

In [6]:
merged_df.shape

(521198, 18)

In [7]:
merged_df["ratingContents"] = merged_df["ratingContents"].apply(
    lambda x: str(x)[1:-1].replace(",", "").replace("'", "")
)
merged_df["ratingContents"] = merged_df["ratingContents"].apply(
    lambda x: str(x).replace(",", "").replace("'", "")
)
merged_df["boxOffice"] = merged_df["boxOffice"].apply(
    lambda x: str(x)[:-1].replace("$", "")
)
merged_df["boxOffice"] = pd.to_numeric(merged_df["boxOffice"], errors="coerce")

In [8]:
merged_df = merged_df.drop(
    columns=[
        "movieid",
    ]
)
merged_df.head()

Unnamed: 0,title,audienceScore,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,boxOffice,distributor,soundType,reviewerName,isFrequentReviewer,reviewText,sentiment
0,James Bond Courageous,65.0,PG-13,Injury Images Brief Drug Use Thematic Elements...,2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Leroy Fernandes,31.4,STX Films,,Karen Hamilton,False,This is nowhere near the level of other great ...,POSITIVE
1,James Bond Courageous,65.0,PG-13,Injury Images Brief Drug Use Thematic Elements...,2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Leroy Fernandes,31.4,STX Films,,Jessica Best,False,If you're feeling adventurous and looking for ...,POSITIVE
2,James Bond Courageous,65.0,PG-13,Injury Images Brief Drug Use Thematic Elements...,2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Leroy Fernandes,31.4,STX Films,,Rebekah Gomez,False,"Mercifully, Vincent Cassel provides a strong l...",POSITIVE
3,James Bond Courageous,65.0,PG-13,Injury Images Brief Drug Use Thematic Elements...,2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Leroy Fernandes,31.4,STX Films,,Paul Sullivan,False,Like the storm that destroys the best laid pla...,POSITIVE
4,James Bond Courageous,65.0,PG-13,Injury Images Brief Drug Use Thematic Elements...,2018-06-01,2018-08-21,120.0,"Adventure, Drama, Romance",English,Leroy Fernandes,31.4,STX Films,,Stacey Malone,True,Less a proper coming-of-age movie than a posh ...,NEGATIVE


In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True) #shuffle the rows

In [11]:
features = merged_df.iloc[:, :-1]
labels = merged_df.iloc[:, -1]

In [12]:
labels.value_counts()

POSITIVE    348093
NEGATIVE    173105
Name: sentiment, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.05)

In [15]:
num_pipe = Pipeline(
    [
        ("impute", IterativeImputer(initial_strategy="most_frequent", imputation_order="ascending", max_iter=50)),
        ("scaler", StandardScaler()),
    ]
)

In [16]:
num_feat_train = num_pipe.fit_transform(
    X_train[["audienceScore", "runtimeMinutes", "boxOffice", "audienceScore", "runtimeMinutes"]]
)



In [17]:
from scipy.sparse import csr_matrix, hstack

In [18]:
num_feat_train_csr = csr_matrix(num_feat_train)

In [19]:
oh_pipe = Pipeline(
    [
        ("oh-enc", OneHotEncoder(handle_unknown="ignore")),
    ]
)

oh_enc_cols = oh_pipe.fit_transform(
    X_train[["rating", "genre", "originalLanguage", "distributor", "soundType"]]
)

In [20]:
oh_enc_cols_csr = csr_matrix(oh_enc_cols)

In [21]:
from scipy.sparse import csr_matrix, hstack

X_train["title"] = X_train["title"].fillna("")
title_vec = TfidfVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 3))
title_mat = title_vec.fit_transform(X_train["title"])

X_train["ratingContents"] = X_train["ratingContents"].fillna("")
ratingContents_vec = TfidfVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 4))
ratingContents_mat = ratingContents_vec.fit_transform(X_train["ratingContents"])

X_train["director"] = X_train["director"].fillna("")
director_vec = TfidfVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 3))
director_mat = director_vec.fit_transform(X_train["director"])

X_train["reviewerName"] = X_train["reviewerName"].fillna("")
reviewerName_vec = TfidfVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 3))
reviewerName_mat = reviewerName_vec.fit_transform(X_train["reviewerName"])

X_train["reviewText"] = X_train["reviewText"].fillna("")
reviewText_vec = TfidfVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 4))
reviewText_mat = reviewText_vec.fit_transform(X_train["reviewText"])

print(title_mat.shape)
print(ratingContents_mat.shape)
print(director_mat.shape)
print(reviewerName_mat.shape)
print(reviewText_mat.shape)

result_train = hstack(
    (
        title_mat,
        ratingContents_mat,
        director_mat,
        reviewerName_mat,
        reviewText_mat,
    )
)

(495138, 39156)
(495138, 17103)
(495138, 21254)
(495138, 6720)
(495138, 3850094)


In [22]:
concatenated_matrix_train = hstack(
    (
        result_train, 
        num_feat_train_csr,
        oh_enc_cols_csr,
    )
)

In [23]:
from sklearn.preprocessing import LabelEncoder

l_enc = LabelEncoder()
l_enc.fit(y_train)
y_train_pre = l_enc.transform(y_train)

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
logit = LogisticRegression(
    n_jobs=-1, C=10, solver="sag", max_iter=5000
)

In [26]:
logit.fit(concatenated_matrix_train, y_train_pre)

In [27]:
test_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv")

In [28]:
metadata = metadata.drop_duplicates("movieid")

In [29]:
merged_data_test = pd.merge(test_data, 
                       metadata, on="movieid", how="left")

print(merged_data_test.shape)

(55315, 17)


In [30]:
merged_data_test.head()

Unnamed: 0,movieid,reviewerName,isTopCritic,reviewText,title,audienceScore,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,boxOffice,distributor,soundType
0,legend_marty_mcfly_oracle,John Kim,False,Green slowly cranks up the dread with style an...,Legend Marty McFly Oracle,57.0,R,"['Gore', 'Bloody Horror Violence', 'Language T...",2022-10-14,2022-10-14,111.0,"Holiday, Horror, Mystery & thriller",English,Sara Barnett,$64.0M,Universal Pictures,Dolby Atmos
1,terminator_katniss_everdeen_glimmer,Brian Chaney,False,Philip Noyce's direction is elegant and unforc...,Terminator Katniss Everdeen Glimmer,86.0,PG,['Emotional Thematic Material'],2002-11-29,2003-04-15,94.0,"Drama, Adventure",English,Cindy Flander,$6.2M,Miramax Films,"Dolby SR, DTS, Dolby Stereo, Surround, SDDS, D..."
2,james_bond_labyrinth_gollum,Danielle Parker,False,It wouldn't do to say what path Maria ultimate...,James Bond Labyrinth Gollum,62.0,,,,,,Drama,French,Charlotte Bramble,,,
3,v_quest_han_solo_wondrous,Brittany Lane,False,Pig is not exactly the arthouse John Wick that...,V Quest Han Solo Wondrous,84.0,,,2021-07-16,2021-08-03,92.0,"Drama, Mystery & thriller",English,Jean Gainer,$3.1M,Neon,
4,enigma_hulk_surreal_starlight,Justin Willis,False,An imaginative no-budget musical of sorts abou...,Enigma Hulk Surreal Starlight,,,,,,66.0,"Drama, Musical",Arabic,Marvin Short,,,


In [31]:
merged_data_test["ratingContents"] = merged_data_test["ratingContents"].apply(
    lambda x: str(x)[1:-1].replace(",", "").replace("'", "")
)
merged_data_test["ratingContents"] = merged_data_test["ratingContents"].apply(
    lambda x: str(x).replace(",", "").replace("'", "")
)
merged_data_test["boxOffice"] = merged_data_test["boxOffice"].apply(
    lambda x: str(x)[:-1].replace("$", "")
)
merged_data_test["boxOffice"] = pd.to_numeric(merged_data_test["boxOffice"], errors="coerce")

In [32]:
num_feat_test = num_pipe.transform(
    merged_data_test[["audienceScore", "runtimeMinutes", "boxOffice", "audienceScore", "runtimeMinutes"]]
)

In [33]:
num_feat_test_csr = csr_matrix(num_feat_test)

In [34]:
oh_enc_cols_test = oh_pipe.transform(
    merged_data_test[["rating", "genre", "originalLanguage", "distributor", "soundType"]]
)

In [35]:
oh_enc_cols_test_csr = csr_matrix(oh_enc_cols_test)

In [36]:
merged_data_test["title"] = merged_data_test["title"].fillna("")
title_mat_test = title_vec.transform(merged_data_test["title"])

merged_data_test["ratingContents"] = merged_data_test["ratingContents"].fillna("")
ratingContents_mat_test = ratingContents_vec.transform(merged_data_test["ratingContents"])

merged_data_test["director"] = merged_data_test["director"].fillna("")
director_mat_test = director_vec.transform(merged_data_test["director"])

merged_data_test["reviewerName"] = merged_data_test["reviewerName"].fillna("")
reviewerName_mat_test = reviewerName_vec.transform(merged_data_test["reviewerName"])

merged_data_test["reviewText"] = merged_data_test["reviewText"].fillna("")
reviewText_mat_test = reviewText_vec.transform(merged_data_test["reviewText"])

print(title_mat_test.shape)
print(ratingContents_mat_test.shape)
print(director_mat_test.shape)
print(reviewerName_mat_test.shape)
print(reviewText_mat_test.shape)

result_test = hstack(
    (
        title_mat_test,
        ratingContents_mat_test,
        director_mat_test,
        reviewerName_mat_test,
        reviewText_mat_test,
    )
)

(55315, 39156)
(55315, 17103)
(55315, 21254)
(55315, 6720)
(55315, 3850094)


In [37]:
result_test.shape

(55315, 3934327)

In [38]:
concatenated_matrix_test = hstack(
    (
        result_test, 
        num_feat_test_csr,
        oh_enc_cols_test_csr,
    )
)

In [39]:
pred = logit.predict(concatenated_matrix_test)

In [40]:
prediction_df = pd.DataFrame(zip(range(len(pred)), pred), columns=["id", "sentiment"])
prediction_df

Unnamed: 0,id,sentiment
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
55310,55310,1
55311,55311,1
55312,55312,0
55313,55313,1


In [41]:
prediction_df["sentiment"] = prediction_df["sentiment"].apply(lambda x : "POSITIVE" if x==1 else "NEGATIVE")
prediction_df

Unnamed: 0,id,sentiment
0,0,POSITIVE
1,1,POSITIVE
2,2,POSITIVE
3,3,POSITIVE
4,4,POSITIVE
...,...,...
55310,55310,POSITIVE
55311,55311,POSITIVE
55312,55312,NEGATIVE
55313,55313,POSITIVE


In [42]:
prediction_df.to_csv("submission.csv", index=False)