In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
kaggle_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in kaggle_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LinearRegression

# **Include DataSet**

In [None]:
# Read the video game sales CSV from the Kaggle input directory and preview the first rows.
csv_path = "../input/videogamesales/vgsales.csv"
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# Count the missing values in every column.
df.isna().sum()

# Delete Unnecessary Variables For Data Science

We will not use "Rank", "Name", "Year", "Publisher" variables in this data set. So we will delete these variables from the data set using the "drop" function.

In this function, the "axis" parameter determines whether it will be a row or a column. If we set "axis = 1", the column will be deleted.

The "inplace" parameter is marked as "False" in the default setting. If we change this to "True", changes will be automatically saved in the data set.


If we do not use the "inplace" parameter, we have to assign the result back to the variable ourselves: `df = df.drop(["Rank", "Name", "Year", "Publisher"], axis=1)`

In [None]:
# Remove the columns that are not used in this analysis; inplace=True modifies df itself.
unused_columns = ["Rank", "Name", "Year", "Publisher"]
df.drop(columns=unused_columns, inplace=True)
df.head()

# Converting Categorical Variables To Numeric variables


Categorical variables are transformed with the "get_dummies" function in pandas.

In [None]:
# One-hot encode the two categorical columns into indicator ("dummy") columns.
categorical_columns = ["Platform", "Genre"]
dums = pd.get_dummies(df[categorical_columns])
dums.head()

After the categorical variables are converted, one of the new dummy columns created for each original categorical variable is selected and deleted.

Because the value of the deleted variable can be understood by looking at the other transformed variables already remaining.


For example, if all non-deleted variables are 0, it means that the deleted variable must be 1.

In [None]:
# Drop one dummy column per categorical variable; it acts as the reference category.
reference_dummies = ["Platform_2600", "Genre_Misc"]
dums.drop(columns=reference_dummies, inplace=True)


Then, a new data set is created by combining the data set with the newly created variables. The old categorical variables that have been transformed are removed from the data set because they are no longer needed.

In [None]:
# Replace the original categorical columns with their dummy-encoded counterparts:
# remove them from df first, then append the dummy columns side by side.
numeric_part = df.drop(columns=["Platform", "Genre"])
final_df = pd.concat([numeric_part, dums], axis="columns")
final_df.head()


This data set is now ready to be used. Let's start.

# Simple Linear Regression

### Outlier Control

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter EU_Sales against Global_Sales with a fitted regression line to spot outliers.
# x and y are given by keyword: seaborn >= 0.12 makes them keyword-only, so the old
# positional call raises a TypeError there. This form also works on older releases.
g = sns.regplot(
    data=final_df,
    x="Global_Sales",
    y="EU_Sales",
    ci=None,  # no confidence band around the regression line
    scatter_kws={"color": "r", "s": 9},
)
plt.xlim(-2, 85)
plt.ylim(bottom=0)

As you can see, there is 1 extreme outlier in the variable "EU_Sales", so let's eliminate this value.

In [None]:
# Show the EU_Sales values above 15. The mask is built from final_df itself, so the
# lookup does not depend on a second DataFrame happening to share the same index.
final_df.loc[final_df["EU_Sales"] > 15, "EU_Sales"]
# This value is at index 0.

In [None]:
# Build the outlier-free frame by dropping every row whose EU_Sales exceeds 15
# (the same cut-off used to locate the outlier), rather than hard-coding row label 0.
outlier_rows = final_df.index[final_df["EU_Sales"] > 15]
df_outlier = final_df.drop(index=outlier_rows)

In [None]:
import matplotlib.pyplot as plt
# Same regression plot as before, drawn from the frame with the outlier removed.
# x and y are given by keyword: seaborn >= 0.12 makes them keyword-only, so the old
# positional call raises a TypeError there. This form also works on older releases.
g = sns.regplot(
    data=df_outlier,
    x="Global_Sales",
    y="EU_Sales",
    ci=None,  # no confidence band around the regression line
    scatter_kws={"color": "r", "s": 9},
)
plt.xlim(-2, 45)
plt.ylim(bottom=0)

Let's choose dependent and independent variables.

In [None]:
# Independent variable: a one-column DataFrame (the list selector keeps it 2-D).
x = df_outlier.loc[:, ["EU_Sales"]]
# Dependent variable: a single column as a Series.
y = df_outlier.loc[:, "Global_Sales"]


To process the data, let's create the model and fit this model with X and Y variables.

In [None]:
# Create the linear regression estimator and fit it on the selected x and y.
reg = LinearRegression()
model = reg.fit(X=x, y=y)


Let's look at the score of the model we created.

In [None]:
# Score the fitted model on the same x and y it was trained on.
model.score(X=x, y=y)


After creating the model, let's have it make predictions for some given values. (It predicts the "Global_Sales" variable from the "EU_Sales" variable.)

In [None]:
# Predict Global_Sales for EU_Sales = 15. The model was fitted on a DataFrame with an
# "EU_Sales" column, so the input uses that column name too; a bare nested list makes
# recent scikit-learn versions warn that X has no valid feature names.
model.predict(pd.DataFrame({"EU_Sales": [15]}))

In [None]:
# Predict Global_Sales for several EU_Sales values at once. The inputs are wrapped in a
# DataFrame with the "EU_Sales" column the model was fitted on, which avoids
# scikit-learn's feature-name warning for unnamed input.
est = pd.DataFrame({"EU_Sales": [12, 30, 50]})
model.predict(est)

# Multiple Linear Regression


In multiple linear regression, the difference is that one variable is selected as the dependent variable and all remaining variables are selected as independent variables.

In [None]:
# Dependent variable: Global_Sales. Independent variables: every other column of final_df.
# NOTE(review): final_df still contains the row removed from df_outlier above — confirm
# that keeping the outlier here is intended.
# NOTE(review): if Global_Sales is derived from the regional *_Sales columns, using them
# as features leaks the target into the inputs — verify against the dataset description.
y = final_df["Global_Sales"]
x = final_df.drop(columns="Global_Sales")

Now we need to divide our data set into "train" and "test" sets, because after training our model on the "train" set, we will test it on the "test" set to see the error value.

For this we will use the **"train_test_split"** function in the **"scikit-learn"** module.

In [None]:
from sklearn.model_selection import train_test_split
# test_size is the fraction of the data set that is held out as the "test" set (here 20%);
# random_state fixes the shuffle so the split is reproducible.
split_parts = train_test_split(x, y, test_size=0.20, random_state=13)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Let's look at the shape information of each part of the split,
# in the order: x_train, y_train, x_test, y_test.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

In [None]:
# Create the model and train it on the "x_train" and "y_train" sets.
# Note that this rebinds `model`, replacing the simple-regression model fitted earlier.
lm = LinearRegression()
model = lm.fit(X=x_train, y=y_train)


After training our model, let's measure its prediction error with the "mean_squared_error" function from the "metrics" module of scikit-learn, which computes the mean of the squared errors.


The model is provided to generate estimates by using the "x_test" set. It is then checked how close this generated prediction is to the "y_test" set.

The number becomes easier to interpret when we take its square root with the "sqrt" function in the numpy module, which gives the root mean squared error (RMSE).

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out test set, then take the square root of the mean squared
# error to obtain the root mean squared error (RMSE).
y_pred = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
np.sqrt(mse)