## Import required packages

In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from joblib import dump


## Load the dataset

In [11]:

# Read the video game sales CSV; the 'Rank' column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer='vgsales.csv',
    index_col='Rank',
)
data


Unnamed: 0_level_0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...
16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


## Preprocessing

In [12]:

# Remove the game title and the per-region sales figures; the remaining
# columns are Platform, Year, Genre, Publisher and Global_Sales.
columns_to_drop = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
data = data.drop(columns=columns_to_drop, errors='ignore')
data


Unnamed: 0_level_0,Platform,Year,Genre,Publisher,Global_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Wii,2006.0,Sports,Nintendo,82.74
2,NES,1985.0,Platform,Nintendo,40.24
3,Wii,2008.0,Racing,Nintendo,35.82
4,Wii,2009.0,Sports,Nintendo,33.00
5,GB,1996.0,Role-Playing,Nintendo,31.37
...,...,...,...,...,...
16596,GBA,2002.0,Platform,Kemco,0.01
16597,GC,2003.0,Shooter,Infogrames,0.01
16598,PS2,2008.0,Racing,Activision,0.01
16599,DS,2010.0,Puzzle,7G//AMES,0.01


In [13]:
# Number of missing entries in each column.
data.isna().sum()

Platform          0
Year            271
Genre             0
Publisher        58
Global_Sales      0
dtype: int64

In [14]:
# Impute missing 'Year' entries with the mean of the known years.
data['Year'] = data['Year'].fillna(value=data['Year'].mean())

# Every row that still contains at least one missing value is discarded.
data = data.dropna(axis='index', how='any')

# Confirm that no column has missing values left.
data.isna().sum()



Platform        0
Year            0
Genre           0
Publisher       0
Global_Sales    0
dtype: int64

In [15]:
# Display the frame after the missing-value handling above.
data

Unnamed: 0_level_0,Platform,Year,Genre,Publisher,Global_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Wii,2006.0,Sports,Nintendo,82.74
2,NES,1985.0,Platform,Nintendo,40.24
3,Wii,2008.0,Racing,Nintendo,35.82
4,Wii,2009.0,Sports,Nintendo,33.00
5,GB,1996.0,Role-Playing,Nintendo,31.37
...,...,...,...,...,...
16596,GBA,2002.0,Platform,Kemco,0.01
16597,GC,2003.0,Shooter,Infogrames,0.01
16598,PS2,2008.0,Racing,Activision,0.01
16599,DS,2010.0,Puzzle,7G//AMES,0.01


## Encoding


In [16]:

# One LabelEncoder per categorical column, so each fitted mapping can later be
# re-applied to new inputs or inverted on its own.
label_encoder_platform = LabelEncoder()
label_encoder_genre = LabelEncoder()
label_encoder_publisher = LabelEncoder()

# Build a new frame with assign() instead of writing through `.loc[:, col]`.
# On recent pandas versions `.loc[:, col] = values` sets values in place, which
# can leave the encoded columns with `object` dtype; assign() replaces each
# column outright, so it takes the integer dtype returned by fit_transform.
# NOTE(review): label encoding gives these nominal categories an arbitrary
# numeric order, which a linear model treats as meaningful; consider one-hot
# encoding if the consumers of the saved model can accept a new input layout.
data = data.assign(
    Platform=label_encoder_platform.fit_transform(data['Platform']),
    Genre=label_encoder_genre.fit_transform(data['Genre']),
    Publisher=label_encoder_publisher.fit_transform(data['Publisher']),
)



## Split features and target, then train the model

In [17]:

# Feature matrix: the three encoded categorical columns plus the release year.
X = data.loc[:, ['Platform', 'Year', 'Genre', 'Publisher']]
# Regression target: global sales.
y = data.loc[:, 'Global_Sales']

# Fit an ordinary least-squares model on the full dataset (no hold-out set).
model = LinearRegression()
model.fit(X=X, y=y)


## Dump the trained model

In [18]:
# Persist the fitted encoders next to the model: the regression was trained on
# label-encoded Platform/Genre/Publisher values, so whatever loads the model
# needs the same mappings to encode its inputs.
dump(
    {
        'Platform': label_encoder_platform,
        'Genre': label_encoder_genre,
        'Publisher': label_encoder_publisher,
    },
    'video_game_sales_encoders.joblib',
)

# Save the trained model; kept as the last expression so the cell still
# displays the list of written model files.
dump(model, 'video_game_sales_prediction.joblib')


['video_game_sales_prediction.joblib']