In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
def preprocess(df, split_column, drop_first=True):
    '''
    Turn a pandas dataframe into model-ready features and target.

    Steps, in the order they are applied:
    1. One-hot encode with pd.get_dummies (drop_first is passed through)
    2. Standardize every resulting column with StandardScaler
    3. Drop the rows that contain N/A
    4. Separate split_column (y) from the remaining columns (x)

    Note that split_column is standardized together with the features, and
    that the scaler is fitted on all rows before any of them are dropped.

    Returns the tuple (x, y).
    '''
    encoded = pd.get_dummies(df, drop_first=drop_first)
    # set_output keeps the result a DataFrame so column labels stay usable.
    scaler = StandardScaler().set_output(transform='pandas')
    scaled = scaler.fit_transform(encoded).dropna()
    features = scaled.drop(columns=split_column)
    target = scaled[split_column].loc[features.index]
    return (features, target)

In [None]:
import sqlite3
import pandas as pd

In [None]:
# Open a connection to the SQLite database file stored under resources/.
conn = sqlite3.connect('resources/database.sqlite') 

In [None]:
# Load every player_attributes row together with its joined Player row.
# NOTE(review): the join key is the two tables' own `id` columns — confirm
# these identify the same player; if player_attributes.id is only a row id,
# the join should use the shared player key instead.
player_query = (
    "SELECT * FROM player_attributes "
    "INNER JOIN Player ON Player.id = player_attributes.id"
)
df2 = pd.read_sql_query(player_query, conn)
df2.shape

In [None]:
# Read the CSV and rename its 'name' column to 'player_name' so it shares a
# key with the database frame.
df1 = (
    pd.read_csv('resources/final_data.csv')
    .rename(columns={'name': 'player_name'})
)

In [None]:
# Combine the CSV data with the database data on player_name, then clean up.
merged_df = (
    df1.merge(df2, on='player_name')
    .dropna()                                # discard rows with missing values
    .drop_duplicates(subset='player_name')   # keep one row per player_name
)

merged_df.shape

In [None]:
# Per-appearance scoring rates. A player with zero appearances has no
# meaningful rate, so zero is swapped for NaN before dividing: the result is
# NaN (removed later by preprocess()'s dropna) instead of inf, which
# StandardScaler refuses to fit on.
# assign() builds a new frame instead of writing columns into the frame
# returned by the filtering steps, which keeps this cell safe to re-run.
appearances = merged_df['appearance'].replace(0, float('nan'))
merged_df = merged_df.assign(
    goal_per_appearance=merged_df['goals'] / appearances,
    assist_per_appearance=merged_df['assists'] / appearances,
)

In [None]:
# Preview the first rows of the merged frame, including the per-appearance columns.
merged_df.head()

In [None]:
# Write the merged frame to CSV; index=False leaves the row index out of the file.
merged_df.to_csv("resources/test.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor


# Keep columns out of the feature matrix that would invalidate the evaluation:
#   * 'goal_per_appearance' is derived from the target ('goals'), so using it
#     as a predictor leaks the answer into the model;
#   * 'player_name' is unique per row after de-duplication, so one-hot
#     encoding it only adds one identifier column per player.
model_df = merged_df.drop(columns=['goal_per_appearance', 'player_name'])

# preprocess() standardizes every column, the target included, so the errors
# reported below are in standardized units of 'goals'.
(X, y) = preprocess(model_df, 'goals')

# Fixed random_state keeps the 80/20 split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Baseline: always predict the training-set mean of the target. The linear
# model is only useful if its MSE is clearly below this one.
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
y_dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y_test, y_dummy_pred)
print(f'Baseline MSE: {baseline_mse}')



In [None]:
# Summary statistics of the target series returned by preprocess().
print(y.describe())

In [None]:
# Derive the RMSE from the linear model's test MSE (`mse`, computed in the
# modelling cell) instead of a pasted-in number, so the value stays correct
# whenever the data or the model change.
rmse = np.sqrt(mse)
print("RMSE:", rmse)

In [None]:
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
# Small, shallow forest; max_features is left at the regressor's default.
# random_state is fixed so the reported scores are reproducible across runs.
rfc = RandomForestRegressor(n_estimators=3, max_depth=3, random_state=42)
rfc.fit(X_train,y_train)

# RandomForestRegressor.score() returns R^2, not an error, so the MSE is
# computed explicitly to be comparable with the linear and baseline MSEs.
rf_mse = mean_squared_error(y_test, rfc.predict(X_test))
print("Random Forest MSE: " + str(rf_mse))
print("Random Forest R^2: " + str(rfc.score(X_test,y_test)))