# EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from etl import safe_literal_eval, movies_df, credits_parquet

### Let's recover the fields vote_count and vote_average from the movies CSV file.

In [None]:
movies_csv = pd.read_csv("data/movies_dataset.csv")

In [None]:
# Keep a single vote record per movie id before joining: with a left merge, an id that
# appears more than once on the right side would duplicate the matching movies_df rows.
votes_by_id = movies_csv[["id", "vote_count", "vote_average"]].drop_duplicates(subset="id")
movies_df = movies_df.merge(votes_by_id, on="id", how="left")

### Missing Data

In [None]:
movies_df.isna().sum()

Having a look at original language field

In [None]:
movies_df[movies_df["original_language"].isna()].head()

The rows with null values in the original_language field are also missing values in other fields such as vote_count, vote_average, etc. Having three features missing is significant for our EDA, so the most suitable approach is to drop these rows.

In [None]:
movies_df.dropna(subset=["original_language"], inplace=True)

### runtime feature missing values

In [None]:
movies_df[movies_df["runtime"].isna()].head()

In these rows, some features such as revenue and budget, and consequently roi (short for return_on_investment), as well as some vote_count values, are zero. Since only a few rows are missing the runtime, the most suitable approach is to drop those rows.

In [None]:
movies_df.dropna(subset=["runtime"], inplace=True)

In [None]:
movies_df.isna().sum()

### collection_id feature
The collection_id feature is an exception here, since it refers to additional information about movies that are related to other ones (there is another dataset from the ETL that can be imported and merged on this id). A more appropriate approach is to add a new feature that only indicates whether the movie belongs to a collection.

In [None]:
movies_df["bellongs_to_collection"] = ~movies_df["collection_id"].isnull()

In [None]:
movies_df.isnull().sum()

For now, we will keep the collection_id feature for the EDA, and maybe drop it later when it is time to train the recommendation model.

# Hypothesis

I would like to know how predominant the English language is in the movies. Perhaps it would be an outlier. There are two things to consider about languages: the original language in which the movie was produced, and the languages available for each movie. Let's have a look at these proportions.

In [None]:
movies_df[["original_language", "spoken_languages"]]

In [None]:
movies_df["spoken_languages"]

This is odd: can you see the language "tl" in the output of the cell above? This ISO 639-1 code does not have a name. Let's take a look at those rows by creating a boolean mask for "tl".

In [None]:
# Flag every movie that lists the ISO 639-1 code "tl" among its spoken languages.
# Each cell of spoken_languages is iterated as a collection of language dicts, and
# the row is flagged as soon as one of them carries "tl" under the "iso_639_1" key.
tl_language_mask = movies_df["spoken_languages"].apply(
    lambda languages: any(language["iso_639_1"] == "tl" for language in languages)
)

In [None]:
movies_df[tl_language_mask].tail()

It seems that "tl" also shows up as the original_language of the Philippine movies. Keep in mind that, in the cell above, we are filtering on the spoken_languages feature.

After some research on Google, it turns out that the ISO 639-1 code "tl" corresponds to the Tagalog language, which is spoken by a quarter of the population of the Philippines. Click (or tap) [here](https://en.wikipedia.org/wiki/Tagalog_language) for further information. Let's fill in this data.

In [None]:
(
    # Preview only (nothing is assigned): take the spoken_languages cells of the
    # rows whose languages include the ISO 639-1 code "tl".
    movies_df.loc[tl_language_mask, "spoken_languages"]
    # Rebuild each list of language dicts: the "tl" entry gets the name "Tagalog",
    # every other entry keeps the name it already had.
    .apply(
        lambda languages: [
            {
                "iso_639_1": language["iso_639_1"],
                "name": "Tagalog" if language["iso_639_1"] == "tl" else language["name"],
            }
            for language in languages
        ]
    )
)

In [None]:
# Persist the fix: every language entry whose ISO 639-1 code is "tl" is named "Tagalog";
# all other entries in the same list keep their existing name.
movies_df.loc[tl_language_mask, "spoken_languages"] = (
    movies_df.loc[tl_language_mask, "spoken_languages"]
    .apply(
        lambda languages: [
            {
                "iso_639_1": language["iso_639_1"],
                "name": "Tagalog" if language["iso_639_1"] == "tl" else language["name"],
            }
            for language in languages
        ]
    )
)

In [None]:
# Lets check the result..
movies_df[tl_language_mask].head()

In [None]:
# Final look at the frame after the "tl" fix: the "tl" entries should now carry a language name
movies_df.head()

But this makes me think that "tl" is probably not the only ISO 639-1 code affected. Let's check whether there are other languages with a blank name.

In [None]:
movies_df["spoken_languages"].apply(lambda languages: "" in (language["name"] for language in languages)).sum() 

Yep, it is.. and let's see what languages does not have name..

The pandas chaining bellow makes this:

Filter all rows whose field spoken_languages avualutes an empty string ("") in the generator of each language object key name.

Then, from the list of languages of each row, give me only the iso 639 1 code whose name is "". That list give me the first element, then give me only unique values

In [None]:
(

    movies_df
    # Row filter and column pick in a single .loc[] call
    .loc[movies_df["spoken_languages"]
         # Row mask: True when at least one language dict in the cell has "" as its "name".
         # The second .loc argument keeps only the spoken_languages column.
         .apply(lambda languages: "" in (language["name"] for language in languages)), "spoken_languages"] # row mask, column label

    # For each remaining row, list the iso_639_1 codes of the languages whose name is ""
    # and keep only the first of them (index 0), so each row yields a single code string.
    .apply(lambda languages: [language["iso_639_1"] for language in languages if language["name"] == ""][0]) # first unnamed code per row
# Distinct codes only
).unique() # the iso_639_1 codes found without a name (first one per row)

OK, there are quite a few of them. Running the same code for each language over and over would be a draining task, so I asked ChatGPT to map those ISO 639-1 codes to their language names. We are then going to iterate over this dictionary to fill in the name for those empty keys, and that's it.

In [None]:
iso_language_mapping = {
    'gd': 'Scottish Gaelic',
    'mn': 'Mongolian',
    'mk': 'Macedonian',
    'kw': 'Cornish',
    'nv': 'Navajo',
    'mi': 'Maori',
    'yi': 'Yiddish',
    'ne': 'Nepali',
    'km': 'Khmer',
    'iu': 'Inuktitut',
    'bo': 'Tibetan',
    'ty': 'Tahitian',
    'si': 'Sinhala',
    'as': 'Assamese',
    'sh': 'Serbo-Croatian',
    'gn': 'Guarani',
    'lo': 'Lao',
    'xh': 'Xhosa',
    'cr': 'Cree',
    'ku': 'Kurdish',
    'hy': 'Armenian',
    'oc': 'Occitan',
    'to': 'Tongan',
    'ce': 'Chechen',
    'qu': 'Quechua',
    'am': 'Amharic',
    'tg': 'Tajik',
    'tt': 'Tatar',
    'se': 'Northern Sami',
    'ml': 'Malayalam',
    'co': 'Corsican',
    'dz': 'Dzongkha',
    'ht': 'Haitian Creole',
    'ln': 'Lingala',
    'my': 'Burmese',
    'sa': 'Sanskrit',
    'fy': 'Western Frisian',
    'tk': 'Turkmen',
    'ny': 'Chichewa',
    'sc': 'Sardinian',
    'gu': 'Gujarati',
    'mr': 'Marathi',
    'ug': 'Uighur',
    'ay': 'Aymara',
    'st': 'Southern Sotho',
    'jv': 'Javanese',
    'br': 'Breton',
    'sg': 'Sango',
    'lb': 'Luxembourgish',
    'ab': 'Abkhazian',
    'sm': 'Samoan',
    'ki': 'Kikuyu',
    'tn': 'Tswana',
    'fo': 'Faroese',
    'sn': 'Shona',
    'bi': 'Bislama',
    'ig': 'Igbo',
    'mh': 'Marshallese'
}

In [None]:
def _fill_language_names(languages):
    """Return a rebuilt language list where every code found in iso_language_mapping gets its mapped name.

    Entries whose code is not in the mapping keep the name they already had.
    """
    return [
        {
            "iso_639_1": language["iso_639_1"],
            "name": iso_language_mapping.get(language["iso_639_1"], language["name"]),
        }
        for language in languages
    ]


# One pass over the column instead of one full scan per mapping entry: flag the rows that
# contain at least one mapped code, then rebuild only those rows.
needs_name_fix = movies_df["spoken_languages"].apply(
    lambda languages: any(language["iso_639_1"] in iso_language_mapping for language in languages)
)
movies_df.loc[needs_name_fix, "spoken_languages"] = (
    movies_df.loc[needs_name_fix, "spoken_languages"].apply(_fill_language_names)
)

Let's check if it worked by searching for those lanuages whose name was this --> ""

In [None]:
# Spot-check the first movie that lists the code "gd". Positional access (.iloc[0]) is used
# so the check does not depend on one specific row label surviving the earlier filtering.
gd_language_mask = movies_df["spoken_languages"].apply(
    lambda languages: any(language["iso_639_1"] == "gd" for language in languages)
)
movies_df.loc[gd_language_mask, "spoken_languages"].iloc[0]

Maybe this task should have been part of our ETL process, but recall the main goal of that ETL script: creating the endpoints.

# Exploring Data Set

#### Now, let's see how numbers are distributed as well as the features entries in the dataset

In [None]:
movies_df.describe()

In [None]:
movies_df.describe(exclude="number")

#### For each numerical feature, count the number of unique entries

In [None]:
unique_numerical_values = movies_df.select_dtypes(include="number").nunique().sort_values()

unique_numerical_values.plot.bar(logy=True, figsize=(15, 4), title="Unique values per feature");

#### Missing Values

In [None]:
plt.figure(figsize=(10,8))
plt.imshow(movies_df.isna(), aspect="auto",
           interpolation="nearest", cmap="gray")


In [None]:
import missingno as msno
msno.matrix(movies_df, labels=True, sort="descending");

In [None]:
movies_df.isna().mean().sort_values().plot(
    kind="bar", figsize=(15,4),
    title="Parcentage of missing values per feature",
    ylabel="Ratio of missing value per feature"
);

### Feature Distribution

This will give us an idea how values are distributed across each feature


The figure right below shows the distribution of all the features that aren't numerical, meaning all the categorical features.

In [None]:
fig, axes = plt.subplots(ncols=1, nrows=3, figsize=(12,8))
movies_non_numerical = movies_df.select_dtypes(exclude=["number","datetime"])

# One panel per column: value frequencies drawn as dots on a log-scaled y axis.
# zip() stops at the shorter input, so at most three columns (one per axis) are drawn.
for ax, col in zip(axes.ravel(), movies_non_numerical.columns):
    counts = movies_non_numerical[col].value_counts()
    counts.plot(logy=True, title=col, lw=0, marker=".", ax=ax)

plt.tight_layout();

### Numerical Features Distribution

In [None]:
movies_df.hist(bins=25,
               figsize=(15, 5),
               layout=(-1, 5),
               edgecolor="black")
plt.tight_layout()

In [None]:
# DataFrame.mode() returns one row per tied mode (shorter columns are padded with NaN),
# so only the first row holds one most-frequent value for every column. Comparing against
# the whole 2-D result would either fail on shape or compare rows to unrelated values.
most_freq_entry = movies_df.mode().iloc[0]

# Per column: share of rows equal to that column's most frequent value.
df_freq = movies_df.eq(most_freq_entry, axis=1).mean().sort_values(ascending=False)

display(df_freq.head())

df_freq.plot.bar(figsize=(15,4));

### Visualizing Outliers or Undesirable Records

In [None]:
movies_df.plot(
    lw=0, marker=".", subplots=True,
    layout=(-1,4), markersize=5, figsize=(15,15)
);

It seems we have some points that stand out from the usual range. For example, which movie lasts more than 1200 minutes? Let's have a look at it.

In [None]:
movies_df["runtime"].describe()

In [None]:
movies_df[movies_df["runtime"] > 1200] 

And certainly the title Centennial is not a movie, it is a series.

But leaving this row (Centennial) aside, what kind of movie lasts more than 4 hours?

In [None]:
movies_df.loc[movies_df["runtime"] > 240, "runtime"].sort_values(ascending=False).hist();

In [None]:
# Runtimes in [240, 400): between() compares numerically, so values that are not whole
# numbers are kept too (isin(range(...)) only matches exact integers).
movies_df.loc[movies_df["runtime"].between(240, 400, inclusive="left"), "runtime"].hist(bins=15);

In [None]:
movies_df.loc[movies_df["runtime"] > 400, "runtime"].hist();

In [None]:
movies_df[movies_df["runtime"] > 1000]

In [None]:
# Titles with a runtime in [400, 1000): numeric interval test instead of matching against
# the integers of a range, so runtimes that are not whole numbers are not silently left out.
movies_df[movies_df["runtime"].between(400, 1000, inclusive="left")][["title", "release_year", "runtime"]]

Let's look up a random title from this filter and see whether it is a movie, for example the title [Empire](https://www.filmaffinity.com/es/film975551.html). Interestingly, it is in fact a movie: a movie of 485 minutes, and a really low-rated one by the way. It could qualify as an outlier because it is not a series.

It is also shown titles such as "The Godfather Trilogy: 1972-1990"... is that supposed to be a movie? it seems to be a collection, but in that case, there is a field for collections already.


As you can see, spotting outliers is a draining task if you have to look up each record to figure out whether it qualifies as an outlier. Luckily, there are numerical approaches to get this task done.


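One quick and straightforward option is the boxplot, a graph built around the interquartile range (IQR). In simple words, the quartiles Q1, Q2 (the median) and Q3 split the sorted data into four intervals that each hold 25% of the values; the box spans from Q1 to Q3 (the IQR), and points lying far outside the box are candidates for outliers. The cells below use seaborn's boxenplot, an extended boxplot that draws additional quantiles.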

In [None]:
import seaborn as sns
sns.boxenplot(data=movies_df)

In [None]:
sns.boxenplot(data=movies_df["budget"])

In [None]:
sns.boxenplot(data=movies_df["revenue"])

In [None]:
sns.boxenplot(data=movies_df["runtime"])

In [None]:
sns.boxenplot(data=movies_df["original_language"].value_counts())


In [None]:
from scipy.stats import norm

def plot_gauss_belt(data_1d):
    """Plot the normal curve fitted to a 1-D sample, with its mean, Q1 and Q3 marked.

    The curve is the normal density that has the sample's mean and (population,
    ddof=0) standard deviation, evaluated on 100 points spanning mean +/- 3 std.
    Missing values (NaN) are ignored.

    Parameters:
        data_1d: 1-D numeric data (e.g. a pandas Series or a NumPy array). If it has a
            non-None ``name`` attribute, the name is used in the curve's legend label.

    Returns:
        None. The figure is drawn on the current matplotlib axes and shown.

    Raises:
        ValueError: if there is no non-missing value to plot.
    """
    # Work on the non-missing values only: np.percentile returns NaN as soon as a single
    # NaN is present, which would silently hide the Q1/Q3 markers.
    values = np.asarray(data_1d, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError("plot_gauss_belt needs at least one non-missing value")

    mean = np.mean(values)
    std_dev = np.std(values)

    Q1 = np.percentile(values, 25)
    Q3 = np.percentile(values, 75)

    # Evaluate the fitted density over three standard deviations on each side of the mean.
    x_values = np.linspace(mean - 3 * std_dev, mean + 3 * std_dev, 100)
    gaussian = norm.pdf(x_values, mean, std_dev)

    # Unnamed input (plain arrays, Series without a name) still gets a usable label.
    name = getattr(data_1d, "name", None)
    label = f"{name} Distribution" if name is not None else "Distribution"

    plt.plot(x_values, gaussian, label=label)
    plt.axvline(mean, color='b', linestyle='--', label='mean')
    plt.axvline(Q1, color='r', linestyle='--', label='Q1')
    plt.axvline(Q3, color='g', linestyle='--', label='Q3')
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.show()

In [None]:
plot_gauss_belt(movies_df["runtime"])

In [None]:
plot_gauss_belt(movies_df["budget"])

In [None]:
plot_gauss_belt(movies_df["revenue"])

In [None]:
plot_gauss_belt(movies_df["vote_average"])

In [None]:
plot_gauss_belt(movies_df["vote_count"])

Certainly there are a few outliers, for example in budget and revenue, where some movies are really far away from the mean. Maybe those unbalanced points are explained by the big capital resources that the Hollywood industry has, far more than any independent movie.

Maybe the data set contains a significant quantity of independent movies, and perhaps the best approach would be clustering in order to create categories such as "is this an independent production?". That might allow us to get rid of these features, but this is only an assumption, and we should not fall into the temptation of supposing. Let's have a look at the values again instead.

A good question is: how many records in the budget and revenue fields are zero? Having zeros is like having nulls in this case, and we need enough data to describe each individual movie. I will copy the dataframe so that, if I change something, I will have no regrets about spoiling it.

In [None]:
movies_draft = movies_df.copy()

In [None]:
(movies_draft["budget"] == 0).sum() / movies_draft.shape[0] * 100 # around 80% of this feature equals zero..

In [None]:
(movies_draft["revenue"] == 0).sum() / movies_draft.shape[0] * 100 # 83% ... 

OK, these features (and consequently return_on_investment) definitely won't be useful for our machine learning model. But this is part of exploring the data set: it is not only about dropping null values and, ta-da, your data is clean and ready for anything you want to do with it (maybe it is only clean). It is more about what makes sense, and considering these features for our ML model would not make any sense. Anyway, let's see what other features we could get.

### Vote Average, Vote Count, and Runtime

We've already seen how the runtime values are distributed, and we found that some rows aren't even movies. We also saw that there are movies whose runtime is above 7 hours, such as Empire by Andy Warhol or Hitler: A Film from Germany. They are outliers and they won't be considered for our purpose. Recalling the Gaussian distribution figure, Q3 is just above a runtime of 100 minutes, so we will drop the movies with a runtime above 200 minutes.

In [None]:
plot_gauss_belt(movies_df["runtime"])

In [None]:
(movies_df["runtime"] >  200).sum() # only 338 movies to drop..

In [None]:
movies_df.shape[0] - movies_df.loc[~(movies_df["runtime"] >  200)].shape[0] # --> this is a simple validation that I am substracting correctly movie above 200 minutues runtime

In [None]:
movies_df = movies_df.loc[~(movies_df["runtime"] >  200)]

And let's see again how results the gauss figure

In [None]:
plot_gauss_belt(movies_df["runtime"])

And what about the movies whose runtime is below 30 minutes? That's the runtime of a single episode of any series.

In [None]:
movies_df[movies_df["runtime"] < 30]

In [None]:
# Show (without removing anything) the rows that have BOTH a runtime of zero and "Unknown" among
# their production countries. The nested production_countries cells are searched with .apply()
# rather than being normalized into a separate table first.
movies_df[(movies_df["runtime"] == 0) 
          & (movies_df["production_countries"]
             # True when any country dict in the cell has "Unknown" as its "iso_3166_1" value
             .apply(lambda countries: "Unknown" in (country["iso_3166_1"] for country in countries)))] 

In [None]:
# Complement of the filter shown in the previous cell: keep every row except those that
# combine a zero runtime with "Unknown" among the production countries.
has_unknown_country = movies_df["production_countries"].apply(
    lambda countries: any(country["iso_3166_1"] == "Unknown" for country in countries)
)

# not (A and B)  ==  (not A) or (not B)
movies_df = movies_df[(movies_df["runtime"] != 0) | ~has_unknown_country]

In [None]:
# Restricting the feature to a value window is a simple way to tighten the fitted bell curve.
# Preview the effect of keeping runtimes in [45, 120). between() compares numerically, so
# runtimes that are not whole numbers are classified correctly as well.
runtime_in_window = movies_df["runtime"].between(45, 120, inclusive="left")
plot_gauss_belt(movies_df.loc[runtime_in_window, "runtime"])
# Withdrawn values in this trim:
print("Withdrawn values in this trim: " + str(int((~runtime_in_window).sum())))

OK, it seems this will be useful for the rest of the features. Let's convert it into a function.

In [None]:
def trimming_gaus_belt_values(dataframe: pd.DataFrame, feature: str, low_limit: int|float, up_limit: int|float):
    """
    Trim values outside a specified range in a Gaussian distribution belt and visualize the trimmed distribution.
    
    Parameters:
        dataframe (pd.DataFrame): The DataFrame containing the data.
        feature (str): The name of the column in the DataFrame representing the feature to trim.
        low_limit (int|float): The lower limit of the Gaussian distribution belt.
        up_limit (int|float): The upper limit of the Gaussian distribution belt.
        
    Returns:
        None
        
    This function trims the values in the specified feature column of the DataFrame that are outside the range
    [low_limit, up_limit]. It then plots the trimmed distribution using the plot_gauss_belt function and prints
    the number of withdrawn values (values outside the specified range).
    """
    
    # this works perfect for trimming data std deviation distribution accross gauss belt by just trimming range of values.
    plot_gauss_belt(dataframe.loc[dataframe[feature].isin(range(low_limit, up_limit)), feature])
    # Withdrawn values in this trim:
    print("Withdrawn values in this trimed range" + str((low_limit, up_limit)) + ": " 
          + str(dataframe.loc[~dataframe[feature].isin(range(low_limit, up_limit))].shape[0]) + "\n"
          + "Being the "
          + str(round(dataframe.loc[~dataframe[feature].isin(range(low_limit, up_limit))].shape[0] / dataframe.shape[0] * 100, 2))
          + " % of the data" + "\n"
          + "The std: " + str(np.std(dataframe.loc[dataframe[feature].isin(range(low_limit, up_limit)), feature]))
          ) 

In [None]:
movies_df["runtime"].describe()

In [None]:
for runtime_value in [25, 50, 75, 100]:
    trimming_gaus_belt_values(movies_df, feature="runtime", low_limit=runtime_value, up_limit=240)

In [None]:
# Vote average, it is commun vote ranges 0 to 10 or 0 to 5.. but above 10? I we can be sure that is not a reliable value
movies_df["vote_average"].describe() # --> ok.. it seems everything ok here..

In [None]:
movies_df["vote_count"].describe() # --> from 0 vores to 14,075 votes.. let's visualize the trimming of the outliers

In [None]:
# This is the std deviation distribution without any trimming
plot_gauss_belt(movies_df["vote_count"])

In [None]:
# Trimming the low limit
for vote_count_value in [1, 10, 50, 150]:
    trimming_gaus_belt_values(movies_df, feature="vote_count", low_limit=vote_count_value, up_limit=900)

In [None]:
# Trimming the upper limit
for vote_count_value in [800, 1600, 2400, 3200]:
    trimming_gaus_belt_values(movies_df, feature="vote_count", low_limit=1, up_limit=vote_count_value)

On average, the data points of the vote_count feature are really far away from the mean: the values are highly spread out. This data will make it difficult to train an ML model, at least because we are looking for a lightweight model and the calculations need consistent values.

But I am curious about wich movie has the most of vote_count

In [None]:
movies_df.loc[movies_df["vote_count"].idxmax()] # Inception, a such good movie by the bay.. this result makes sense..

So, very intuitively and judging just by the figures, my conclusions are:

* I will keep this trade off on runtime feature:

        Withdrawn values in this trimed range(25, 240): 1932 
        Being the 4.38 % of the data
        The std: 21.40339410277729

* Keep the vote_average feature as it originally is; no changes are needed.

* Plan to get rid of the vote_count feature because its dispersion is too high, although I won't drop it yet.



In [None]:
# Keep the movies whose runtime lies in [25, 240). The numeric interval test also keeps
# runtimes that are not whole numbers, which isin(range(25, 240)) would silently drop.
movies_df = movies_df[movies_df["runtime"].between(25, 240, inclusive="left")]

In [None]:
movies_df

### Getting more features:

Let's explore the cast. One hypothesis is that the actors carry weight in the decision to watch a movie. That's only a hypothesis, but I mean, I really like every movie where Gary Oldman is in the cast: Leon: The Professional, The Dark Knight... what good movies!

Maybe this step should have been in the ETL script, but I didn't include it there because I wanted to keep the script as light as possible. The code below is commented out because it is only executed on my computer in order to produce the parquet file needed in the repo.

In [None]:
# # Importing data
# credits_csv = pd.read_csv("data/credits.csv")
# # Checking nulls
# credits_csv["cast"].isnull().sum()
# # Getting objects
# credits_csv["cast"] = credits_csv["cast"].apply(safe_literal_eval)
# # Extracting names only
# credits_csv["cast"] = credits_csv["cast"].apply(lambda characters: [{"name": character["name"]} for character in characters])
# # Dropping useless field
# credits_csv.drop(columns=["crew"], inplace=True)

In [None]:
# # Loading into the parquet_data folder repo
# credits_csv.to_parquet("parquet_data/cast_eda_dataset.parquet")

In [None]:
characters_df = pd.read_parquet("parquet_data/cast_eda_dataset.parquet")

In [None]:
import sys
sys.getsizeof(characters_df) / 1000000

In [None]:
characters_df["id"] = characters_df["id"].astype(str)

In [None]:
characters_df["cast"] = characters_df["cast"].apply(lambda actors: [actor["name"] for actor in actors])

In [None]:
characters_df

In [None]:
characters_df.merge(movies_df, on="id", how="inner")

In [None]:
movies_df = characters_df.merge(movies_df, on="id", how="inner")

In [None]:
movies_df.shape

## Including feature overview. 


This feature is rather large, and it was dropped because including it would not meet the memory constraints of the environment used for the Render deployment.

In [None]:
movies_csv["overview"].isnull().sum()

In [None]:
movies_csv.dropna(subset=["overview"], inplace=True)

In [None]:
# One overview per movie id: the frame is the right side of a left merge, where a repeated
# id would duplicate the matching movies_df rows.
overviews_df = movies_csv[["id", "overview"]].drop_duplicates(subset="id")

In [None]:
movies_df = movies_df.merge(overviews_df, on="id", how="left")

In [None]:
movies_df.shape

In [None]:
movies_df.head(2)

### Including Genres feature

Here we are only going to include the genres field from the movies CSV file, with all the genres of a movie kept in a single row. This means we are still going to work with nested data, handling it with .apply() and lambda expressions.

In [None]:
# Just to keep track of the dataframe shape
movies_df.shape

In [None]:
movies_csv["genres"].isnull().sum()

In [None]:
movies_csv["genres"] = movies_csv["genres"].apply(safe_literal_eval)

In [None]:
# Flatten each movie's list of genre dicts into one string: the genre names, in their
# original order, glued together with ", ".
movies_csv["genres"] = movies_csv["genres"].apply(
    lambda genres: ", ".join(genre["name"] for genre in genres)
)

In [None]:
# One genres string per movie id: the frame is the right side of a left merge, where a
# repeated id would duplicate the matching movies_df rows.
genres_df = movies_csv[["id", "genres"]].drop_duplicates(subset="id")

In [None]:
genres_df.shape

In [None]:
movies_df = movies_df.merge(genres_df, on="id", how="left")

In [None]:
movies_df.shape

### Formating cast feature

I'll also change the feature name to actors

In [None]:
movies_df["cast"] = movies_df["cast"].apply(lambda actors: ", ".join(actors))

In [None]:
movies_df.rename(columns={"cast": "actors"}, inplace=True)

### Creating the Content for TF-IDF model

In [None]:
movies_df.head(1)

In [None]:
# overview and genres come from left merges, so movies without a match hold NaN there, and
# "text" + NaN evaluates to NaN: the whole content string (title, year, actors included)
# would be lost. Missing pieces are therefore treated as empty text.
movies_df["content"] = (
    movies_df["title"]
    + " released in " + movies_df["release_year"].astype(str)
    + " Overview: " + movies_df["overview"].fillna("")
    + ". Actors in the movie: " + movies_df["actors"]
    + ". Movie's genres: " + movies_df["genres"].fillna("")
)

In [None]:
data_for_recomender = movies_df[["title", "content"]]

In [None]:
len(data_for_recomender["content"][0])

In [None]:
# data_for_recomender.to_parquet("parquet_data/data_for_recommender.parquet")

In [None]:
import psutil

process = psutil.Process()

print(f"Memory used: {process.memory_info().rss / 1024 / 1024:.2f} MB")

# Build of Recomendation Model

This space is used to iterate on the model and monitor memory consumption. Once a suitable model is found, it is time to import the parquet data and create the .py file for the FastAPI application.

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

In [2]:
data = pd.read_parquet("parquet_data/data_for_recommender.parquet")

In [3]:
data_sample = data.sample(2000, random_state=10)

In [4]:
data_sample.reset_index(drop=True, inplace=True)

In [5]:
# Lookup Series: sampled-frame index values, keyed by movie title.
# NOTE(review): drop_duplicates() deduplicates the Series values (the index values), not the
# titles, so repeated titles remain as repeated keys — confirm this is intended.
# NOTE(review): no cell visible in this notebook reads `indeces`; get_recommendations builds
# its own lookup from the dataframe it receives.
indeces = pd.Series(data_sample.index, index=data_sample["title"]).drop_duplicates()

In [6]:
tfidf = TfidfVectorizer(stop_words="english")

In [7]:
tfidf_matrix = tfidf.fit_transform(data_sample["content"].fillna(""))

In [8]:
tfidf_matrix

<2000x30921 sparse matrix of type '<class 'numpy.float64'>'
	with 121742 stored elements in Compressed Sparse Row format>

In [9]:
cosine_similarities_ = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
import sys
sys.getsizeof(cosine_similarities_) / 1000000

32.000128

In [11]:
import psutil
process = psutil.Process()

print(f"Memory used: {process.memory_info().rss / 1024 / 1024:.2f} MB")

Memory used: 224.27 MB


# Model 1

In [12]:
def get_recommendations(df, column, value, cosine_similarities, limit=10):
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.

    Rows of ``df`` are matched to rows of ``cosine_similarities`` by position, so the
    function also works when ``df`` does not have a 0..n-1 index. When ``value`` occurs
    several times in ``column``, the first occurrence is used as the target.

    Args:
        df (object): Pandas dataframe containing the text data.
        column (string): Name of column used, i.e. 'title'.
        value (string): Name of title to get recommendations for, i.e. 'Toy Story'.
        cosine_similarities (array): Cosine similarities matrix from linear_kernel, with
            one row and one column per row of ``df``, in the same order.
        limit (int, optional): Optional limit on number of recommendations to return.

    Returns:
        Pandas dataframe with the columns 'index' (row position of the recommended item),
        'recommendation' (its value in ``column``) and 'cosine_similarity_score', ordered
        from most to least similar.

    Raises:
        KeyError: if ``value`` is not present in ``df[column]``.
    """
    labels = df[column].to_numpy()

    # Row position of the target; the first match wins when the value is repeated.
    matches = np.flatnonzero((df[column] == value).to_numpy())
    if matches.size == 0:
        raise KeyError(value)
    target_position = int(matches[0])

    # Similarity of the target against every row of the dataframe.
    scores = np.asarray(cosine_similarities[target_position], dtype=float).ravel()

    # Most similar first; the stable sort keeps the original row order among ties.
    ranking = np.argsort(-scores, kind="stable")

    # Exclude the target itself explicitly: it is not guaranteed to be ranked first
    # (e.g. another row can tie with it), so slicing off "the first item" is unsafe.
    ranking = ranking[ranking != target_position][:max(limit, 0)]

    return pd.DataFrame({
        'index': ranking,
        'recommendation': labels[ranking],
        'cosine_similarity_score': scores[ranking],
    })

# Model 2

In [13]:
# def get_recommendations(df, column, value, cosine_similarities, limit=10):
#     """
#     Return a dataframe of content recommendations based on TF-IDF cosine similarity.

#     Args:
#         df (object): Pandas dataframe containing the text data.
#         column (string): Name of column used, i.e. 'title'.
#         value (string): Name of title to get recommendations for, i.e. Toy Story
#         cosine_similarities (array): Cosine similarities matrix from linear_kernel
#         limit (int, optional): Optional limit on number of recommendations to return.

#     Returns:
#         Pandas dataframe.
#     """

#     # Create a dictionary to map values to indices
#     indices = {val: idx for idx, val in enumerate(df[column].unique())}

#     # Get the index for the target value
#     target_index = indices[value]

#     # Get the cosine similarity scores for the target value
#     cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))

#     # Sort the cosine similarities in order of closest similarity
#     cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)

#     # Return tuple of the requested closest scores excluding the target item and index
#     cosine_similarity_scores = cosine_similarity_scores[1:limit+1]

#     # Get the indices for the closest items
#     recommendation_indices = [i[0] for i in cosine_similarity_scores]

#     # Get the actual recommendations
#     recommendations = df[column].loc[recommendation_indices].tolist()

#     # Get the indices and scores
#     index = [x[0] for x in cosine_similarity_scores]
#     scores = [x[1] for x in cosine_similarity_scores]

#     # Create the dataframe
#     df = pd.DataFrame({'index': index, 'recommendation': recommendations, 'cosine_similarity_score': scores})

#     return df


In [14]:
get_recommendations(df=data_sample, column="title", value="Toy Story", cosine_similarities=cosine_similarities_)

Unnamed: 0,index,recommendation,cosine_similarity_score
0,326,Toy Story 2,0.467124
1,628,Paris-Manhattan,0.126984
2,1741,Radio Days,0.108983
3,1841,Mr. Warmth: The Don Rickles Project,0.096097
4,693,A Fighter's Blues,0.088095
5,1282,Skylark,0.08087
6,834,Lost and Love,0.07625
7,97,Mighty Aphrodite,0.069594
8,168,PT 109,0.068901
9,759,Surf II,0.068372


It did not work because of the data sample: maybe Toy Story was not picked in the sample. Let's use a little trick to choose a good random seed whose sample includes the Toy Story title.

In [15]:
# movies_df.loc[movies_df["title"] == "Big City Blues"]

In [16]:
# # This just iterates until toy story appears in the data sample. 
# rand = 0
# while (data_sample[data_sample["title"] == "Toy Story"].shape[0] > 0) == False:
#     data_sample = data_for_recomender.sample(3500, random_state=rand)
#     rand += 1

# print(rand - 1)
    

In [17]:
import psutil
process = psutil.Process()

print(f"Memory used: {process.memory_info().rss / 1024 / 1024:.2f} MB")

Memory used: 224.50 MB
