In [None]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Show floats in pandas output with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Read the FIFA player subset from its relative CSV path and show the first rows.
csv_path = 'Kaggle_Data/footbal_FIFA_subset.csv'
soccer_data = pd.read_csv(csv_path)

soccer_data.head(5)

In [None]:
# Inspect the dtype of every column in the freshly loaded data.
soccer_data.dtypes

In [None]:
# Convert 'Wage' from its raw text form into int64 values: drop the first
# character (presumably a currency symbol — confirm against the CSV) and
# replace every 'K' with '000' before casting.
#
# The dtype guard keeps this cell re-runnable: once 'Wage' is numeric the .str
# accessor would raise AttributeError, so the conversion is simply skipped.
if not pd.api.types.is_numeric_dtype(soccer_data['Wage']):
    soccer_data['Wage'] = (
        soccer_data['Wage']
        .str[1:]
        .str.replace('K', '000', regex=False)  # literal replacement, not a regex
        .astype(np.int64)
    )
soccer_data.head()
soccer_data.dtypes

In [None]:
# Keep only the rows whose wage is above 10,000; every column is retained.
# Row counts before and after the filter are displayed for comparison.

len(soccer_data)

# .copy() makes subset_data an independent frame, so modifying it later does
# not trigger pandas' SettingWithCopyWarning or depend on soccer_data.
subset_data = soccer_data[soccer_data['Wage'] > 10000].copy()

subset_data.head()

len(subset_data)

In [None]:
# Count the missing values in each column of the filtered data.
subset_data.isna().sum()

In [None]:
# List the column names of the filtered data.
subset_data.columns

In [None]:
# Remove the features that are excluded from the model.
# Alternative exclusion sets, kept for reference:
#   ['Agility', 'Age', 'Penalties']
#   ['Agility', 'Age', 'Stamina']
#
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# lets the cell be re-run after the columns are already gone. Note that it also
# silently skips misspelled column names, so double-check the list when editing.
subset_data = subset_data.drop(columns=['Agility', 'Stamina', 'Penalties'], errors='ignore')

In [None]:
# One scatter figure per feature, each plotted against the wage.
feature_frame = subset_data.drop(columns=['Wage'])
wage_values = subset_data['Wage']
for feature_name in feature_frame.columns:
    fig, ax = plt.subplots()
    ax.scatter(feature_frame[feature_name], wage_values, color='red', marker='o')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Wage')
    plt.show()

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable. 'Wage' is the target, every other column is a feature.
feature_columns = subset_data.drop(columns=['Wage'])
wage_target = subset_data['Wage']
X_train, X_test, y_train, y_test = train_test_split(
    feature_columns,
    wage_target,
    test_size=0.25,
    random_state=35,
)

X_train
X_test
y_train
y_test

In [None]:
# Fit a regression tree on the training split. Each leaf must hold at least
# three samples, and the fixed random_state keeps the fit repeatable.
clf = DecisionTreeRegressor(min_samples_leaf=3, random_state=50).fit(X_train, y_train)



In [None]:
from sklearn.tree import export_text

# Print the fitted tree as indented text rules. Passing the training column
# names labels every split with the real feature instead of the generic
# 'feature_<i>' placeholder. list() is used because older scikit-learn
# releases document this argument as a plain list of names.
tree_rules = export_text(clf, feature_names=list(X_train.columns))
print(tree_rules)

In [None]:
# Feature importances of the fitted tree, shown as a bare array with one entry
# per training feature (presumably in X_train column order — confirm before reading).
clf.feature_importances_

In [None]:
# Predict wages for the held-out rows, keeping X_test's index so the
# predictions can be lined up with the true wages later.
predicted_wages = clf.predict(X_test)
test_output = pd.DataFrame({'pred_Wage': predicted_wages}, index=X_test.index)
test_output.head()

In [None]:
# Attach the true wages to the predictions, matching rows on the index.
# Selecting only 'pred_Wage' first keeps the cell re-runnable: merging again
# into a frame that already holds 'Wage' would create 'Wage_x'/'Wage_y'
# columns and break the 'Wage' lookups below.
test_output = test_output[['pred_Wage']].merge(y_test, left_index = True, right_index = True)
test_output.head()
# Mean absolute error, in the same units as 'Wage'.
mean_absolute_error = (test_output['pred_Wage'] - test_output['Wage']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)

In [None]:
# Mean absolute error expressed as a fraction of the mean true wage.
wage_abs_error = (test_output['pred_Wage'] - test_output['Wage']).abs()
wage_abs_error.mean() / test_output['Wage'].mean()


#### Visualize data

In [None]:
# Import the visualization libraries and initialise plotly's offline notebook mode.
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the star import pulls every plotly.graph_objs name into the
# notebook namespace; prefer the explicit `go` alias imported below.
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Set to True to render interactive actual-vs-predicted scatter plots for every
# feature. Two figures are produced per feature, so this is off by default.
SHOW_PER_FEATURE_PLOTS = False


def plot_actual_vs_predicted(feature_values, actual, predicted, split_name, feature_name):
    """Show one plotly scatter of actual and predicted wages against a feature.

    Parameters
    ----------
    feature_values : values plotted on the x axis (one feature column).
    actual : true wages for the same rows.
    predicted : model predictions for the same rows.
    split_name : label prefix for the traces, e.g. 'Train' or 'Test'.
    feature_name : x-axis title.
    """
    traces = [
        go.Scatter(x=feature_values, y=actual,
                   name=split_name + ' data actual', mode='markers'),
        go.Scatter(x=feature_values, y=predicted,
                   name=split_name + ' data predicted', mode='markers'),
    ]
    layout = go.Layout(xaxis=dict(title=feature_name), yaxis=dict(title='Wage'),
                       title='Plot of predicted and actual')
    plotly.offline.iplot(go.Figure(data=traces, layout=layout))


if SHOW_PER_FEATURE_PLOTS:
    # Predictions do not depend on which feature is plotted, so compute them
    # once instead of once per column.
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)
    for col in X_train.columns:
        plot_actual_vs_predicted(X_train[col], y_train, train_predictions, 'Train', col)
        plot_actual_vs_predicted(X_test[col], y_test, test_predictions, 'Test', col)



In [None]:
if False:
    # Disabled: scatter of the training rows' 'Overall' against 'Age'.
    # NOTE(review): assumes both columns are still present in X_train — confirm.
    overall_vs_age = go.Scatter(
        x=X_train['Overall'],
        y=X_train['Age'],
        name='Train data actual',
        mode='markers',
    )
    fig = go.Figure(
        data=[overall_vs_age],
        layout=go.Layout(
            xaxis=dict(title='Overall'),
            yaxis=dict(title='Age'),
            title='Plot of Main Variables',
        ),
    )
    plotly.offline.iplot(fig)


