# Importing Libraries

In [105]:
import numpy as np
import pandas as pd
import os

from nest_asyncio import apply
from numpy.ma.core import remainder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import (
MinMaxScaler,
OrdinalEncoder,
FunctionTransformer,
MaxAbsScaler,
)
import ast

# Reading Data

In [2]:
def read_data(filename)->pd.DataFrame:
    """Read a CSV file located under the sibling ``../data`` directory.

    Parameters
    ----------
    filename
        Name of the CSV file; it is joined onto ``../data`` with ``os.path.join``.

    Returns
    -------
    pd.DataFrame
        The frame produced by ``pd.read_csv`` with its default options.
    """
    csv_path = os.path.join("..", "data", filename)
    frame = pd.read_csv(csv_path)
    return frame

In [20]:
# Load the training split (../data/train.csv) into the frame used by every cell below.
train = read_data(filename="train.csv")

### genres

In [9]:
# Peek at the raw genre strings (pandas abbreviates the display of a long Series).
train["genres"]

0         open_world_survival_craft sandbox survival 2d ...
1         multiplayer mini_golf golf casual sports funny...
2         co-op action fps heist multiplayer looter_shoo...
3         cinematic online_co-op multiplayer turn-based_...
4         open_world dark_comedy multiplayer post-apocal...
                                ...                        
724950    simulation adventure sports casual indie racin...
724951    simulation adventure sports casual indie racin...
724952    simulation adventure sports casual indie racin...
724953    simulation adventure sports casual indie racin...
724954    simulation adventure sports casual indie racin...
Name: genres, Length: 724955, dtype: object

### release_year

In [29]:
# Distinct release years present in the training data.
train["release_year"].unique()

array([2011, 2023, 2013, 2020, 2019, 2016, 2021, 2022, 2015, 2014, 2018,
       2017, 2024, 2009, 2010, 2006, 2007, 2012])

### number_of_reviews_from_purchased_people

In [30]:
# Inspect the per-row review-count column.
train["number_of_reviews_from_purchased_people"]

0         1061260.0
1           22129.0
2          431019.0
3           13539.0
4           61554.0
            ...    
724950        196.0
724951        196.0
724952        196.0
724953        196.0
724954        196.0
Name: number_of_reviews_from_purchased_people, Length: 724955, dtype: float64

In [21]:
# List every column available in the training frame.
train.columns

Index(['game_name', 'short_description', 'long_description', 'genres',
       'minimum_system_requirement', 'recommend_system_requirement',
       'release_year', 'developer', 'publisher', 'overall_player_rating',
       'number_of_reviews_from_purchased_people', 'number_of_english_reviews',
       'link', 'review', 'helpful', 'funny', 'recommendation', 'username',
       'user_id', 'game_id', 'game_avg_hrs_played', 'user_avg_hrs_played'],
      dtype='object')

In [6]:
# Distinct overall-rating labels; these are the categories encoded ordinally further down.
train["overall_player_rating"].unique()

array(['Overwhelmingly Positive', 'Very Positive', 'Mostly Positive',
       'Mixed', 'Mostly Negative'], dtype=object)

In [102]:
# The notebook only uses `from sklearn.<module> import ...`, which never binds the
# top-level name `sklearn`; import it here so this cell works on a fresh kernel.
import sklearn

# Keep transformers returning NumPy arrays / SciPy sparse matrices (the library default).
sklearn.set_config(transform_output="default")

# Final Feature Engineering Pipeline

In [68]:
# Sanity check: sklearn's OrdinalEncoder with an explicit category order yields one
# output column for the rating column.
(
    OrdinalEncoder(
        categories=[
            [
                'Overwhelmingly Positive',
                'Very Positive',
                'Mostly Positive',
                'Mixed',
                'Mostly Negative',
            ]
        ]
    )
    .fit_transform(train.loc[:, ["overall_player_rating"]])
    .shape
)

(724955, 1)

In [47]:
def Order_encoder(ser: "pd.Series | pd.DataFrame") -> np.ndarray:
    """Encode overall-rating labels as ordinal integers.

    Parameters
    ----------
    ser
        Either a Series of rating labels or a DataFrame whose first column
        holds them (the form used when called with ``train[["col"]]``).

    Returns
    -------
    np.ndarray
        int64 array of shape ``(n_rows, 1)``. Each label is replaced by its
        position in ``order_list`` (0 = 'Overwhelmingly Positive' ... 4 =
        'Mostly Negative'); labels not in the list, including missing values,
        become -1.
    """
    order_list=['Overwhelmingly Positive', 'Very Positive', 'Mostly Positive','Mixed', 'Mostly Negative']
    # One dict lookup per value instead of two linear list scans per row.
    rank_by_label={label:rank for rank,label in enumerate(order_list)}
    # Accept both a one-column frame and a plain Series.
    ratings=ser.iloc[:,0] if isinstance(ser,pd.DataFrame) else ser
    # map() leaves NaN for labels missing from the dict; those become the -1 sentinel.
    codes=ratings.map(rank_by_label).fillna(-1).astype("int64")
    return codes.to_numpy().reshape(-1,1)
# Confirm the hand-written encoder returns a single column for the whole training set.
Order_encoder(train.loc[:, ["overall_player_rating"]]).shape

(724955, 1)

In [86]:
# Show the dtype pandas inferred for each column.
train.dtypes

game_name                                   object
short_description                           object
long_description                            object
genres                                      object
minimum_system_requirement                  object
recommend_system_requirement                object
release_year                                 int64
developer                                   object
publisher                                   object
overall_player_rating                       object
number_of_reviews_from_purchased_people    float64
number_of_english_reviews                   object
link                                        object
review                                      object
helpful                                      int64
funny                                        int64
recommendation                              object
username                                    object
user_id                                      int64
game_id                        

In [108]:
# Class balance of the target column.
train["recommendation"].value_counts()

recommendation
Recommended        580435
Not Recommended    144520
Name: count, dtype: int64

In [94]:
# Bag-of-words over the genre strings, with counts stored as int32.
CountVectorizer(dtype=np.int32).fit_transform(train["genres"])

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 15509315 stored elements and shape (724955, 353)>

In [107]:
# Fit the game-detail sub-pipeline on its own, using only the five game-level columns.
# NOTE(review): game_detail_transform_3 is assigned in a later cell of this notebook,
# so that cell has to be executed before this one.
game_detail_transform_3.fit_transform(
    train.loc[
        :,
        [
            "genres",
            "release_year",
            "number_of_reviews_from_purchased_people",
            "overall_player_rating",
            "game_avg_hrs_played",
        ],
    ]
)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18174662 stored elements and shape (724955, 357)>

In [117]:
# Check that to_numpy() on a one-column frame already gives a 2-D (n_rows, 1) array.
# Only the shape is displayed, which is what the output recorded for this cell shows.
FunctionTransformer(lambda ser: ser.to_numpy()).fit_transform(train[["game_id"]]).shape

(724955, 1)

In [138]:


# Overall-rating labels in the order used for encoding: position 0 is
# 'Overwhelmingly Positive', position 4 is 'Mostly Negative'.
RATING_ORDER=['Overwhelmingly Positive', 'Very Positive', 'Mostly Positive','Mixed', 'Mostly Negative']

# Stage 1: make the text columns numeric and let the numeric columns through.
#   * genres                -> bag-of-words counts (one column per token)
#   * overall_player_rating -> position in RATING_ORDER; a label that is not listed
#     becomes -1 instead of raising, the same fallback Order_encoder uses
#   * every other column handed to this transformer passes through unchanged
# OrdinalEncoder replaces the former per-row lambda: it is vectorised, copes with
# unseen labels at transform time, and keeps the pipeline picklable.
# NOTE(review): CountVectorizer's default token pattern treats "-" as a separator, so
# a tag such as "co-op" is not kept as a single token — confirm this is intended.
game_detail_transform_1=ColumnTransformer(
    transformers=[
        ("GenresTransformer",CountVectorizer(),"genres"),
        (
            "overall_player_rating_transform",
            OrdinalEncoder(
                categories=[RATING_ORDER],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
            # OrdinalEncoder expects 2-D input, hence the list selector.
            ["overall_player_rating"],
        ),
    ],remainder="passthrough"
)

# Stage 2: rescale the stage-1 output with MaxAbsScaler.
# The scaler is applied to EVERY column leaving stage 1 (genre counts and rating code
# included), not only release_year as the step name suggests; the step name is kept
# so lookups by name keep working.
game_detail_transform_2=Pipeline(
    steps=[
        ("game_detail_transformer_1",game_detail_transform_1),
        ("scaler_release_year",MaxAbsScaler()),
    ]
)

# Stage 3: route the five game-level columns into stage 2; any other column given to
# this transformer is passed through untouched.
game_detail_transform_3=ColumnTransformer(
    transformers=[
        (
            "game_detail_transformer_2",
            game_detail_transform_2,
            [
                "genres",
                "release_year",
                "number_of_reviews_from_purchased_people",
                "overall_player_rating",
                "game_avg_hrs_played",
            ],
        ),
    ],remainder="passthrough"
)

# Full feature-engineering transformer. Output column blocks, in order:
#   game_id | game-detail features | user_id | scaled user_avg_hrs_played | target
# Columns not listed below are dropped.
# The id columns use the built-in "passthrough" and the target uses OrdinalEncoder
# instead of lambda-based FunctionTransformers, so the fitted transformer can be
# pickled and no per-row Python code runs.
column_transformer=ColumnTransformer(
    transformers=[
        # Identifier kept as-is so rows can still be traced back to a game.
        ("game_id","passthrough",["game_id"]),
        (
            "game details transform",
            game_detail_transform_3,
            [
                "genres",
                "release_year",
                "number_of_reviews_from_purchased_people",
                "game_avg_hrs_played",
                "overall_player_rating",
            ],
        ),
        # Identifier kept as-is so rows can still be traced back to a user.
        ("user_id_pass","passthrough",["user_id"]),
        # Despite its name this step uses MaxAbsScaler; the name is kept so lookups
        # by name keep working.
        ("minmaxscaler",MaxAbsScaler(),["user_avg_hrs_played"]),
        # Target: "Not Recommended" -> 0, "Recommended" -> 1. Any other label raises
        # ValueError, as the previous list.index() lookup did.
        (
            "target_transformer",
            OrdinalEncoder(categories=[["Not Recommended","Recommended"]]),
            ["recommendation"],
        ),
    ],remainder="drop"
)
column_transformer

In [15]:
# NOTE(review): this repeats the read_data definition from the "Reading Data" section;
# running this cell simply rebinds the same name.
def read_data(filename)->pd.DataFrame:
    """Read a CSV file located under the sibling ``../data`` directory.

    Parameters
    ----------
    filename
        Name of the CSV file; it is joined onto ``../data`` with ``os.path.join``.

    Returns
    -------
    pd.DataFrame
        The frame produced by ``pd.read_csv`` with its default options.
    """
    csv_path = os.path.join("..", "data", filename)
    frame = pd.read_csv(csv_path)
    return frame

# Train Dataset Transform

In [139]:
# Fit the full transformer on the training data and display the resulting matrix.
# Only the columns the transformer refers to are selected.
column_transformer.fit_transform(
    train.loc[
        :,
        [
            "genres",
            "release_year",
            "number_of_reviews_from_purchased_people",
            "game_avg_hrs_played",
            "overall_player_rating",
            "game_id",
            "user_avg_hrs_played",
            "user_id",
            "recommendation",
        ],
    ]
)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20916967 stored elements and shape (724955, 361)>