In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import scale
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Plot styling: white background with grid lines for every seaborn/matplotlib figure below.
sns.set_style("whitegrid")
# Silence all Python warnings for the rest of the session (keeps cell output clean,
# but also hides deprecation notices from the plotting/ML libraries).
warnings.simplefilter("ignore")

In [None]:
# Kaggle dataset location of the match results table.
results_path = '/kaggle/input/international-football-results-from-1872-to-2017/results.csv'

# Read the CSV and discard every row that has at least one missing value.
df = pd.read_csv(results_path).dropna()
df.head()

In [None]:
# Matches with the most goals scored

In [None]:
# Total goals per match: sum of the two score columns (positional columns 3 and 4).
# Computed in one vectorised pass instead of appending to an array row by row
# (np.append copies the whole array each time, which made the loop quadratic).
# Cast to float so the array matches what the element-wise np.append loop produced.
total_score = df.iloc[:, 3:5].to_numpy(dtype=float).sum(axis=1)

# Row positions ordered from the highest-scoring match to the lowest.
sort = np.flip(total_score.argsort())

In [None]:
rank_bound = 20

# Pick the `rank_bound` highest-scoring matches by position, then show them chronologically
# with a red colour gradient over the numeric columns.
top_positions = sort[:rank_bound]
top_matches = df.iloc[top_positions]
top_matches.sort_values("date").style.background_gradient(cmap="Reds")

In [None]:
# Tournament with the most matches

In [None]:
rank_bound = 10

# Most frequent tournaments, re-sorted ascending so the largest bar ends up at the top
# of the horizontal bar chart.
ax = df.tournament.value_counts().head(rank_bound).sort_values()
value = ax.values
label = ax.index

plt.figure(figsize=(14,6))
plt.barh(y=label, width=value, edgecolor="k")
# Write each count inside its bar. Iterating over the values themselves (rather than
# range(rank_bound)) avoids an IndexError when fewer than `rank_bound` categories exist.
for i, count in enumerate(value):
    plt.text(x=50, y=i-0.1, s=count, color="w", fontsize=12)
plt.title("Tournaments with the Most Matches")
plt.xlabel("Number of matches")
plt.ylabel("Tournament")
plt.show()

In [None]:
# Country with the most matches

In [None]:
rank_bound = 10

# Most frequent values of the `country` column, re-sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart.
ax = df.country.value_counts().head(rank_bound).sort_values()
value = ax.values
label = ax.index

plt.figure(figsize=(14,6))
plt.barh(y=label, width=value, edgecolor="k")
# Write each count inside its bar. Iterating over the values themselves (rather than
# range(rank_bound)) avoids an IndexError when fewer than `rank_bound` categories exist.
for i, count in enumerate(value):
    plt.text(x=10, y=i-0.1, s=count, color="w", fontsize=12)
plt.title("Countries with the Most Matches")
plt.xlabel("Number of matches")
plt.ylabel("Country")
plt.show()

In [None]:
# City with the most matches

In [None]:
rank_bound = 10

# Most frequent values of the `city` column, re-sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart.
ax = df.city.value_counts().head(rank_bound).sort_values()
value = ax.values
label = ax.index

plt.figure(figsize=(14,6))
plt.barh(y=label, width=value, edgecolor="k")
# Write each count inside its bar. Iterating over the values themselves (rather than
# range(rank_bound)) avoids an IndexError when fewer than `rank_bound` categories exist.
for i, count in enumerate(value):
    plt.text(x=5, y=i-0.1, s=count, color="w", fontsize=12)
plt.title("Cities with the Most Matches")
plt.xlabel("Number of matches")
plt.ylabel("City")
plt.show()

In [None]:
# Histogram for years

In [None]:
# The year is the first four characters of each date's string form.
# Vectorised equivalent of looping over df.date and calling int(str(date)[0:4]).
years = df.date.astype(str).str[:4].astype(int).tolist()

plt.figure(figsize=(14,6))
plt.hist(years, density=True, bins=12, edgecolor="k")
plt.title("Histogram of Years")
# density=True normalises the bars so their area sums to 1, so the axis shows a
# probability density rather than raw match counts.
plt.ylabel("Density")
plt.xlabel("Year")
plt.show()

In [None]:
# How many points did each country collect?

In [None]:
def TotalPoint(country, friendly=False, average=False, return_point=False, goal=False, data=None):
    """Compute league-style points (3 per win, 1 per draw, 0 per loss) for one team.

    Parameters
    ----------
    country : str
        Team name, matched against the ``home_team`` and ``away_team`` columns.
    friendly : bool, default False
        If True, every match of the team is counted. If False, rows whose
        ``tournament`` equals ``"Friendly"`` are excluded, for home and away
        matches alike.
    average : bool, default False
        Return the mean points per match instead of the total.
    return_point : bool, default False
        Return the per-match points as a NumPy array (takes priority over ``average``).
    goal : bool, default False
        Return ``(scored, conceded)`` as two per-match lists (takes priority over
        every other flag).
    data : pandas.DataFrame, optional
        Match table to use. Defaults to the notebook-level ``df``. Besides the three
        named columns above, positional columns 3 and 4 are read as home and away
        goals (same positional indexing as before; presumably ``home_score`` and
        ``away_score`` in results.csv -- confirm against the file header).

    Returns
    -------
    Total points (default), mean points (``average``; NaN when the team has no
    matching games), a NumPy array of per-match points (``return_point``), or a
    tuple of two lists (``goal``).
    """
    matches = df if data is None else data

    involved = (matches["home_team"] == country) | (matches["away_team"] == country)
    if not friendly:
        # Applied to the already-combined home/away mask on purpose: the unparenthesised
        # form `home | away & not_friendly` parses as `home | (away & not_friendly)` and
        # would let home friendlies slip through.
        involved = involved & (matches["tournament"] != "Friendly")
    played = matches[involved]

    # Re-express every match from the team's own point of view.
    is_home = (played["home_team"] == country).to_numpy()
    scores = played.iloc[:, 3:5].to_numpy()
    home_goals, away_goals = scores[:, 0], scores[:, 1]
    scored = np.where(is_home, home_goals, away_goals)
    conceded = np.where(is_home, away_goals, home_goals)

    # Exactly one entry per match, so len(point) is the number of games played.
    point = np.where(scored > conceded, 3, np.where(scored == conceded, 1, 0))

    if goal:
        return list(scored), list(conceded)
    if return_point:
        return point
    if average:
        # No matches -> NaN rather than a division by zero.
        return point.sum() / len(point) if len(point) else float("nan")
    return point.sum()

In [None]:
# Column-oriented accumulator; one entry per team is appended to every list.
point_data = {"Country":[],
             "Total Point":[],
             "Average Point":[],
             "Scored":[],
             "Concede":[],
             "Number of Game":[]}

# Every team that appears as either the home or the away side.
for country in pd.concat([df.home_team, df.away_team], axis=0).unique():
    # The per-match points array already contains everything needed for the total,
    # the average and the game count, so TotalPoint is called once for points and
    # once for goals instead of four times (each call re-filters the whole frame).
    points = TotalPoint(country, return_point=True)
    g, c = TotalPoint(country, goal=True)

    n_games = len(points)
    total = int(points.sum()) if n_games else 0

    point_data["Country"].append(country)
    point_data["Total Point"].append(total)
    # NaN for a team without any counted game; such rows are removed by dropna() below.
    point_data["Average Point"].append(total / n_games if n_games else float("nan"))
    point_data["Number of Game"].append(n_games)
    point_data["Scored"].append(sum(g))
    point_data["Concede"].append(sum(c))

point_df = pd.DataFrame(point_data).dropna()
point_df.head()

In [None]:
# Rank every team by total points (1 = most points) and display the full table
# with a red colour gradient.
rank = len(point_df)
ax = (
    point_df
    .sort_values("Total Point", ascending=False)
    .set_index("Country")
    .assign(Rank=np.arange(1, rank + 1))
)
ax.style.background_gradient(cmap="Reds")

In [None]:
# Standardise every column except the team name, then embed the teams with t-SNE
# (fixed seed so the embedding is repeatable).
features = point_df.drop(columns="Country")
X = scale(features)
tsne = TSNE(perplexity=100, random_state=42, verbose=1)
X_embedded = tsne.fit_transform(X)

In [None]:
plt.figure(figsize=(15,5))
# x and y must be passed by keyword: recent seaborn releases accept only `data`
# positionally and raise a TypeError on scatterplot(x_values, y_values).
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1])
plt.title('t-SNE with no Labels' , size=18, fontweight='bold', fontfamily='monospace')
plt.show()

In [None]:
# Fix the seed: K-Means starts from random centroids, so without it the cluster ids
# (and therefore the per-cluster tables further down) change from run to run.
cluster = KMeans(n_clusters=5, random_state=42)
# Fit on the standardised features and label each team with its cluster.
y_pred = cluster.fit_predict(X)

In [None]:
# One colour per cluster actually present in the predictions, so the palette length
# always matches the number of hue levels.
n_labels = len(np.unique(y_pred))
palette = sns.hls_palette(n_labels, l=.4, s=.9)
plt.figure(figsize=(15,5))
# x and y must be passed by keyword: recent seaborn releases accept only `data`
# positionally and raise a TypeError on scatterplot(x_values, y_values).
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred, palette=palette)
# This figure is coloured by cluster, so the title says so.
plt.title('t-SNE with K-Means Labels' , size=18, fontweight='bold', fontfamily='monospace')
plt.show()

In [None]:
# Store each team's K-Means cluster id next to its statistics
# (values are assigned row by row, in the frame's current order).
point_df = point_df.assign(**{"K means": y_pred})
point_df.head()

In [None]:
# Teams assigned to cluster 0.
point_df.loc[point_df["K means"].eq(0)]

In [None]:
# Teams assigned to cluster 1.
point_df.loc[point_df["K means"].eq(1)]

In [None]:
# Teams assigned to cluster 2.
point_df.loc[point_df["K means"].eq(2)]

In [None]:
# Teams assigned to cluster 3.
point_df.loc[point_df["K means"].eq(3)]

In [None]:
# Teams assigned to cluster 4.
point_df.loc[point_df["K means"].eq(4)]