# **Football Teams Rating**

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Reading the Data

In [None]:
# Load the football team statistics CSV into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer='../input/football-teams-rankings-stats/Football teams.csv'
)
df

# Understanding the Data

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Total number of cells (rows x columns).
df.size

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Concise summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Pairwise correlations between the numeric columns only.
# Selecting numeric dtypes first keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() raises on non-numeric columns instead of silently dropping them.
df.select_dtypes(include='number').corr()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count the missing entries in every column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
import missingno as no
# Bar chart of the non-null count per column.
no.bar(df, color='pink')

In [None]:
# Heatmap of missing values (one row per record, one column per feature).
# yticklabels must be the boolean False to hide the row labels; the string
# 'False' is interpreted as a sequence of tick labels instead.
sns.heatmap(df.isnull(), yticklabels=False, cmap='Greens')

In [None]:
# Number of rows per tournament, drawn as percentage slices of a pie chart.
df1 = df['Tournament'].value_counts()
plt.pie(
    x=df1.values,
    labels=df1.index,
    autopct='%0.2f%%',
)
plt.title(label='Tournaments Percentage', fontsize=15)
plt.show()

In [None]:
sns.set_style('darkgrid')
plt.figure(figsize=(18,10))
# nlargest selects the 10 rows with the most goals from the whole frame and
# returns them in descending order. Slicing df[:10] first only ranked the first
# ten rows of the file, which are not necessarily the top scorers.
sns.barplot(x='Team', y='Goals', data=df.nlargest(10, 'Goals'))
plt.title('Top 10 Goal Scoring Teams', fontsize=15)

In [None]:
# Strip plot of goals for each tournament, one point per row of df.
plt.figure(figsize=(10, 5))
sns.stripplot(data=df, x='Tournament', y='Goals', palette='magma_r')
plt.show()

In [None]:
# Bar plot of goals per tournament (seaborn aggregates the rows of each tournament).
plt.figure(figsize=(14, 6))
sns.barplot(data=df, x='Tournament', y='Goals', palette='rainbow')
plt.title(label='Goals in Tournament', fontsize=15)

In [None]:
plt.figure(figsize=(18,8))
# Pick the 10 teams with the highest possession from the whole frame, already
# sorted in descending order. Slicing df[:10] before sorting only ranked the
# first ten rows of the file.
sns.barplot(x='Team', y='Possession%', data=df.nlargest(10, 'Possession%'), palette='Paired')
plt.title("Top Possession Teams are", fontsize=15)

In [None]:
# Kernel density estimate of the pass-completion percentage.
sns.kdeplot(data=df, x='Pass%')

In [None]:
# Kernel density estimate of the possession percentage, drawn in red.
sns.kdeplot(data=df, x='Possession%', color='r')

In [None]:
# Kernel density estimate of aerial duels won, drawn in green.
sns.kdeplot(data=df, x='AerialsWon', color='g')

In [None]:
# Per-tournament totals of the selected statistics as a grouped bar chart.
# - Columns are selected with a list: the tuple form df.groupby(...)['a', 'b']
#   was removed in pandas 2.0.
# - figsize is passed to DataFrame.plot, which opens its own figure; a preceding
#   plt.figure(...) call would only leave an empty figure behind.
# NOTE(review): 'Pass%' and 'Possession%' are summed across teams here, as in the
# original cell; confirm a sum (rather than a mean) is the intended aggregate.
tournaments = (
    df.groupby('Tournament')[['Goals', 'Pass%', 'Possession%', 'Shots pg']]
    .sum()
    .plot(kind='bar', figsize=(16, 10))
)
plt.title(" The different league details", fontsize=15)
plt.legend()
plt.show()

In [None]:
# Distribution of red cards. distplot is deprecated in seaborn; histplot with a
# KDE overlay and density scaling is its documented replacement.
sns.histplot(df['red_cards'], kde=True, stat='density', kde_kws=dict(cut=3), color='red')

In [None]:
# Distribution of yellow cards. distplot is deprecated in seaborn; histplot with a
# KDE overlay and density scaling is its documented replacement.
sns.histplot(df['yellow_cards'], kde=True, stat='density', kde_kws=dict(cut=3), color='y')

In [None]:
sns.set_style('darkgrid')
plt.figure(figsize=(18,10))
# Take the 10 highest-rated teams from the whole frame (returned in descending
# order) rather than sorting only the first ten rows of the file.
sns.barplot(x='Team', y='Rating', data=df.nlargest(10, 'Rating'))
plt.title('Top 10 Teams Rating', fontsize=15)

In [None]:
# Scatter-plot matrix of the pairwise relationships in df.
sns.pairplot(data=df)

In [None]:
plt.figure(figsize=(18,12))
# Annotated heatmap of the correlations between numeric columns. Non-numeric
# columns are excluded explicitly because DataFrame.corr() raises on them in
# pandas >= 2.0 instead of silently dropping them.
sns.heatmap(df.select_dtypes(include='number').corr(), yticklabels='auto', annot=True, cmap='YlGn')
plt.show()

# Splitting the Data into Dependent and Independent Variables

In [None]:
# Target is the 'Rating' column; features are all remaining columns except
# 'Team' and 'Tournament'.
y = df["Rating"]
x = df.drop(columns=['Team', 'Tournament', 'Rating'])

# Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Fit an extra-trees ensemble on the full data only to read off feature importances.
# random_state is fixed (same seed as the train/test split) so the importances are
# reproducible across kernel restarts.
model = ExtraTreesRegressor(random_state=10)
model.fit(x,y)
print(model.feature_importances_)

In [None]:
# Label each importance with its feature name and chart the five largest.
feat_imp = pd.Series(data=model.feature_importances_, index=x.columns)
feat_imp.nlargest(5).plot.barh()

# Splitting the Data into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    random_state=10,
    test_size=0.2,
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model fitted on the training split.
lr = LinearRegression()
lr.fit(X=xtrain, y=ytrain)

## Prediction

In [None]:
# Predict on the training split first, then on the held-out split.
ypred_train, ypred_test = (lr.predict(features) for features in (xtrain, xtest))

## Accuracy

In [None]:
from sklearn import metrics
# R² on the held-out split as a percentage; stored in ac1 for the model comparison.
ac1 = 100 * metrics.r2_score(ytest, ypred_test)
print("Accuracy of training data:", 100 * metrics.r2_score(ytrain, ypred_train))
print("Accuracy of testing data:", ac1)

## Error

In [None]:
# Report MAE, MSE and RMSE of the current test-set predictions.
for _label, _value in (
    ('Mean Absolute Error:', metrics.mean_absolute_error(ytest, ypred_test)),
    ('Mean Squared Error:', metrics.mean_squared_error(ytest, ypred_test)),
    ('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ytest, ypred_test))),
):
    print(_label, _value)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor using the 3 closest training rows.
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X=xtrain, y=ytrain)

## Prediction

In [None]:
# Predict on the training split first, then on the held-out split.
ypred_train, ypred_test = (knn.predict(features) for features in (xtrain, xtest))

## Accuracy

In [None]:
from sklearn import metrics
# R² on the held-out split as a percentage; stored in ac2 for the model comparison.
ac2 = 100 * metrics.r2_score(ytest, ypred_test)
print("Accuracy of training data:", 100 * metrics.r2_score(ytrain, ypred_train))
print("Accuracy of testing data:", ac2)

## Error

In [None]:
# Report MAE, MSE and RMSE of the current test-set predictions.
for _label, _value in (
    ('Mean Absolute Error:', metrics.mean_absolute_error(ytest, ypred_test)),
    ('Mean Squared Error:', metrics.mean_squared_error(ytest, ypred_test)),
    ('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ytest, ypred_test))),
):
    print(_label, _value)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Shallow random forest (trees limited to depth 3) trained on a single core.
# random_state is fixed (same seed as the train/test split) so the score used in
# the model comparison is reproducible across kernel restarts.
rf = RandomForestRegressor(max_depth=3, n_jobs=1, random_state=10)
rf.fit(xtrain, ytrain)

## Prediction

In [None]:
# Predict on the training split first, then on the held-out split.
ypred_train, ypred_test = (rf.predict(features) for features in (xtrain, xtest))

## Accuracy

In [None]:
from sklearn import metrics
# R² on the held-out split as a percentage; stored in ac3 for the model comparison.
ac3 = 100 * metrics.r2_score(ytest, ypred_test)
print("Accuracy of training data:", 100 * metrics.r2_score(ytrain, ypred_train))
print("Accuracy of testing data:", ac3)

## Error

In [None]:
# Report MAE, MSE and RMSE of the current test-set predictions.
for _label, _value in (
    ('Mean Absolute Error:', metrics.mean_absolute_error(ytest, ypred_test)),
    ('Mean Squared Error:', metrics.mean_squared_error(ytest, ypred_test)),
    ('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(ytest, ypred_test))),
):
    print(_label, _value)

# Comparing Accuracy

In [None]:
# Map each model name to its test-set R² score (in percent).
# - Names are the keys, so two models with identical scores cannot overwrite
#   each other (float scores as keys could collide).
# - ac1 comes from the LinearRegression model, so it is labelled
#   'Linear Regression' rather than 'Logistic Regression'.
accuracy = {'Linear Regression': ac1, 'KNN': ac2, 'Random Forest': ac3}

In [None]:
# Bar chart comparing the models: the dictionary's values are plotted on the
# x-axis and its keys on the y-axis.
sns.set_style('darkgrid')
plt.figure(figsize=(14, 10))
model_accuracies = [*accuracy.values()]
model_names = [*accuracy]
sns.barplot(x=model_accuracies, y=model_names, palette='rainbow')

The **KNN Regressor** achieves the highest test-set R² score (**89.04%**).

Hence we will save the model.

# Saving the Model

In [None]:
import pickle
# Serialise the fitted KNN model to model.pkl. The context manager guarantees the
# file is flushed and closed even if pickling fails (a bare open() leaked the handle).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)

# Sample Prediction

In [None]:
# Predict the rating for one hand-written sample using the KNN model that was
# selected and saved. The name `model` refers to the ExtraTreesRegressor fitted
# earlier for feature importance, so it is not used here.
# The sample is wrapped in a DataFrame carrying the training column names so the
# values line up with the features the estimator was fitted on.
sample = pd.DataFrame([[67, 14.4, 57, 2, 57.7, 87.7, 11.8]], columns=x.columns)
p = knn.predict(sample)
# predict returns an array with one value per sample; take the single element
# explicitly instead of calling float() on the array.
print("Rating = ", round(float(p[0]), 2))