In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv('vgsales.csv')

# Example: Assuming df is your full DataFrame
X = df.drop('Global_Sales', axis=1)
y = df['Global_Sales']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Identify categorical and numerical columns
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

# Define preprocessors
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Final pipeline
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])

# Fit and predict
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)


In [84]:
from sklearn.metrics import mean_squared_error, r2_score
MSE = mean_squared_error(y_test, predictions)
R2 = r2_score(y_test, predictions)



In [92]:
ls_result = pd.DataFrame(['Linear Regrations', MSE, R2]).transpose()
ls_result.columns = ['Method', 'Test MSE', 'Test R2']
ls_result

Unnamed: 0,Method,Test MSE,Test R2
0,Linear Regrations,0.000183,0.999932


In [46]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Sample dataset
# df = pd.read_csv('your_data.csv')
X = df.drop('Global_Sales', axis=1)
y = df['Global_Sales']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify column types
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

# Final pipeline with RandomForestRegressor
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit and predict
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

# Evaluation
print("MSE:", mean_squared_error(y_test, predictions))
print("R²:", r2_score(y_test, predictions))


MSE: 0.7344999510371271
R²: 0.8283063375186285


In [47]:
rf_MSE = mean_squared_error(y_test, predictions)
rf_R2 = r2_score(y_test, predictions)

In [54]:
rf_result = pd.DataFrame(['RandomForest Regressor', rf_MSE, rf_R2]).transpose()
rf_result.columns = ['Method', 'Test MSE', 'Test R2']
rf_result

Unnamed: 0,Method,Test MSE,Test R2
0,RandomForest Regressor,0.7345,0.828306


In [53]:
df_model = pd.concat([ls_result, rf_result], axis=0).reset_index(drop=True)

df_model

NameError: name 'ls_result' is not defined

In [26]:
import pandas as pd
import numpy as np

In [6]:
# df = pd.read_csv("vgsales.csv")

df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [14]:
df.isna().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [13]:
df.dropna(subset=['Year'], inplace=True)
df.dropna(subset=['Publisher'], inplace=True)

In [16]:
y = df['Global_Sales']
y

0        82.74
1        40.24
2        35.82
3        33.00
4        31.37
         ...  
16593     0.01
16594     0.01
16595     0.01
16596     0.01
16597     0.01
Name: Global_Sales, Length: 16291, dtype: float64

In [20]:
x = df.drop("Global_Sales", axis=1)

x

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00
...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00


In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

cat_feature = ['Name' , 'Platform', 'Genre', 'Publisher']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, cat_feature)], remainder='passthrough')

transformer_x = transformer.fit_transform(x)
transformer_x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 136313 stored elements and shape (16291, 11950)>

In [38]:
np.random.seed(42)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(transformer_x, y, test_size = 0.2)

model = RandomForestRegressor()
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

RR = model.score(X_test, Y_test)

RR

0.8320466612019755

In [39]:
rf_result_2 = pd.DataFrame(['RandomForest Regressor', RR]).transpose()
rf_result_2.columns = ['Method', 'Test R2']
rf_result_2

Unnamed: 0,Method,Test R2
0,RandomForest Regressor,0.832047


In [43]:

rf_result_2

Unnamed: 0,Method,Test R2
0,RandomForest Regressor,0.832047


In [51]:

rf_result

Unnamed: 0,Method,Test MSE,Test R2
0,RandomForest Regressor,0.7345,0.828306


In [52]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()

X_train, X_test, Y_train, Y_test = train_test_split(transformer_x, y, test_size = 0.2)

linear.fit(X_train, Y_train)

y_linear_train_pred = linear.predict(X_train)
y_linear_test_pred = linear.predict(X_test)

from sklearn.metrics import mean_squared_error , r2_score

lr_train_msr = mean_squared_error(y_train , y_linear_train_pred)
lr_train_r2 = r2_score(y_train, y_linear_train_pred)

lr_test_msr = mean_squared_error(y_test, y_linear_test_pred)
lr_test_r2 = r2_score(y_test, y_linear_test_pred)

print('lr_train_msr:', lr_train_msr)
print('lr_train_r2:', lr_train_r2)
print('lr_test_msr:', lr_test_msr)
print('lr_test_r2:', lr_test_r2)

lr_train_msr: 4.602737459047776
lr_train_r2: -1.3006160511414264
lr_test_msr: 6.256902407324375
lr_test_r2: -0.46258755849492306
