In [2]:
# import the modules
# --------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the training data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./train.csv")

### Load the dataset

- Load the training data and use pandas to explore the statistical properties of the dataset.

In [3]:
# Skewness is only defined for numeric data. Restrict the frame explicitly:
# recent pandas releases no longer drop non-numeric columns silently and raise
# a TypeError when a text column reaches the reduction.
# NOTE(review): assumes `df` holds text columns (Name/Position are used in other cells) - confirm.
numeric_features = df.select_dtypes(include="number")

print('Skewness for the different features is as shown below: ')
print(numeric_features.skew())

Skewness for the different features is as shown below: 
Id          -0.000627
Age          0.387514
Overall     -0.006657
Potential    0.204984
Value (M)    6.597601
Wage (M)     7.382001
dtype: float64


In [4]:
# sns.heatmap(df.corr())

In [5]:
# Correlation cut-offs: a pair is kept when its coefficient is above the upper
# threshold or below the lower threshold.
upper_threshold = 0.5
lower_threshold = -0.5

# Pairwise correlations, flattened into a Series indexed by (column, column)
# and sorted in ascending order. The frame is limited to numeric columns first:
# recent pandas releases raise on non-numeric columns instead of dropping them,
# and `select_dtypes` also works on releases where `corr` has no `numeric_only`.
correlation = (
    df.select_dtypes(include="number")
    .corr()
    .unstack()
    .sort_values(kind='quicksort')
)

# Keep the strongly correlated pairs and drop coefficients equal to 1 (such as
# a column paired with itself). Each remaining pair shows up twice, once per
# ordering of the two column names.
is_strong = (correlation > upper_threshold) | (correlation < lower_threshold)
corr_var_list = correlation[is_strong & (correlation != 1)]
print(corr_var_list)

Id         Overall     -0.975595
Overall    Id          -0.975595
Id         Potential   -0.653503
Potential  Id          -0.653503
Id         Value (M)   -0.548213
Value (M)  Id          -0.548213
Id         Wage (M)    -0.519570
Wage (M)   Id          -0.519570
Potential  Wage (M)     0.512910
Wage (M)   Potential    0.512910
Overall    Wage (M)     0.589736
Wage (M)   Overall      0.589736
Potential  Value (M)    0.595095
Value (M)  Potential    0.595095
Overall    Value (M)    0.635618
Value (M)  Overall      0.635618
Overall    Potential    0.678228
Potential  Overall      0.678228
Wage (M)   Value (M)    0.845124
Value (M)  Wage (M)     0.845124
dtype: float64


### Visualize the data

- Check for the categorical & continuous features. 
- Check out the best plots for plotting between categorical target and continuous features and try making some inferences from these plots.
- Check for the correlation between the features

In [6]:
# print(df.columns)
# df_group = df.groupby(['Position']).sum()
# #Code starts here
# sns.countplot(x='Position', data=df)

# value_distribution_values = df.sort_values("Wage (M)", ascending=False).reset_index().head(100)[["Name", "Wage (M)"]]
# sns.countplot(x='Wage (M)', data=value_distribution_values)
# # value_distribution_values = df[]

# overall = df.sort_values("Overall")

# overall_value = overall.groupby(['Overall'])['Value (M)'].mean()# 

# # Code ends here

In [7]:

# p_list_1= ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']

# p_list_2 = ['GK', 'LWB', 'CB', 'RWB', 'LM', 'CDM', 'CAM', 'CM', 'RM', 'LW', 'RW']
  
# # p_list_1 stats
# df_copy = df.copy()
# store = []
# for i in p_list_1:
#     store.append([i,
#                     df_copy.loc[[df_copy[df_copy['Position'] == i]['Overall'].idxmax()]]['Name'].to_string(
#                         index=False), df_copy[df_copy['Position'] == i]['Overall'].max()])
# df_copy.drop(df_copy[df_copy['Position'] == i]['Overall'].idxmax(), inplace=True)
# # return store
# df1= pd.DataFrame(np.array(store).reshape(11, 3), columns=['Position', 'Player', 'Overall'])


# # p_list_2 stats
# df_copy = df.copy()
# store = []
# for i in p_list_2:
#     store.append([i,
#                     df_copy.loc[[df_copy[df_copy['Position'] == i]['Overall'].idxmax()]]['Name'].to_string(
#                         index=False), df_copy[df_copy['Position'] == i]['Overall'].max()])
# df_copy.drop(df_copy[df_copy['Position'] == i]['Overall'].idxmax(), inplace=True)
# # return store
# df2= pd.DataFrame(np.array(store).reshape(11, 3), columns=['Position', 'Player', 'Overall'])

# if df1['Overall'].mean() > df2['Overall'].mean():
#         print(df1)
#         print(p_list_1)
# else:
#     print(df2)
#     print(p_list_2)

### Model building

- Separate the features and target and then split the train data into train and validation set.
- Now let's come to the actual task, using linear regression, predict the `Value (M)`. 
- Try improving upon the `r2_score` (R-Square) using different parameters that give the best score. You can use higher degree [Polynomial Features of sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) to improve the model prediction. 



In [8]:
# Code starts here
# --------------
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures


def _fit_and_report(features_train, target_train, features_eval, target_eval):
    """Fit a LinearRegression and report its scores on an evaluation split.

    Prints the r2 score and the mean absolute error, in that order, labelled
    "r2" and "mae".

    Parameters
    ----------
    features_train, target_train
        Data the regression is fitted on.
    features_eval, target_eval
        Data the fitted regression is scored on.

    Returns
    -------
    tuple
        ``(fitted_model, predictions, r2_value, mae_value)`` where
        ``predictions`` are the model outputs for ``features_eval``.
    """
    fitted_model = LinearRegression()
    fitted_model.fit(features_train, target_train)
    predictions = fitted_model.predict(features_eval)

    r2_value = r2_score(target_eval, predictions)
    print("r2", r2_value)

    mae_value = mean_absolute_error(target_eval, predictions)
    print("mae", mae_value)

    return fitted_model, predictions, r2_value, mae_value


# Features and target.
X = df[['Overall','Potential','Wage (M)']]
y = df['Value (M)']

# 70/30 split with a fixed seed so the scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

# --------------
# Baseline: plain linear regression on the three features.
model, y_pred, r2, mae = _fit_and_report(X_train, y_train, X_test, y_test)

# --------------
# Degree-3 polynomial expansion. The expansion is fitted on the training split
# only and then applied unchanged to the validation split.
poly = PolynomialFeatures(3)
X_train_2 = poly.fit_transform(X_train)
X_test_2 = poly.transform(X_test)

# `model`, `r2` and `mae` are rebound here: after this cell they refer to the
# polynomial regression.
model, y_pred_2, r2, mae = _fit_and_report(X_train_2, y_train, X_test_2, y_test)

# Code ends here

r2 0.7676309781948667
mae 1.3718341450247453
r2 0.9481242645946444
mae 0.5118790302908705


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [11]:
# Read the test data; the path is relative to the notebook's working directory.
test3 = pd.read_csv(filepath_or_buffer="./test.csv")

In [12]:
# Code Starts here
# Keep the identifier column aside before `test3` is narrowed to the model features.
Id = test3.loc[:, 'Id']
print(Id)

# Code ends here

0         280
1         569
2        8731
3       10085
4        9831
        ...  
3592     5584
3593     4629
3594     6123
3595      664
3596     4238
Name: Id, Length: 3597, dtype: int64


In [13]:
# Narrow the test frame to the three feature columns, in this order.
test3 = test3.loc[:, ['Overall', 'Potential', 'Wage (M)']]

In [14]:
# Preview the first five rows of the test frame.
test3.head(n=5)

Unnamed: 0,Overall,Potential,Wage (M)
0,82,82,0.046
1,79,85,0.105
2,67,74,0.007
3,65,65,0.002
4,66,67,0.001


In [15]:
# Expand the test features with `poly`; `transform` (not `fit_transform`) is
# used so the expansion is not refitted on the test data.
test_3 = poly.transform(X=test3)

In [16]:
# Predict on the expanded test features with the current `model`.
pred = model.predict(X=test_3)

In [17]:
# Pair each test Id with its prediction and write the result without the index column.
submission_file1 = pd.DataFrame({'Id': Id}).assign(Value=pred)
submission_file1.to_csv(path_or_buf='submission3.csv', index=False)