-
Notifications
You must be signed in to change notification settings - Fork 73
/
analyze_fifa_data.py
115 lines (80 loc) · 2.71 KB
/
analyze_fifa_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 29 09:05:34 2019
@author: sadrachpierre
"""
import numpy as np
import pandas as pd
# Load the raw player table from the working directory.
df = pd.read_csv("data.csv")
print(df.head())

# Keep only the columns used by the rest of the analysis.
df = df[['Name', 'Age', 'Nationality', 'Value', 'Wage', 'Preferred Foot', 'Height', 'Weight', 'Position', 'Overall']]
print(df.head())
print(df['Height'].head())


def _height_to_cm(height):
    """Convert one raw height entry to centimetres.

    The first character is read as feet and everything from the third
    character on as inches (the second character is skipped as a separator).
    NOTE(review): presumably entries look like 5'7 -- confirm against data.csv.

    Returns NaN when either part is not numeric, which is what happens for a
    missing (NaN) cell.
    """
    text = str(height)
    try:
        total_inches = float(text[0]) * 12.0 + float(text[2:])
    except ValueError:
        return np.nan
    return total_inches * 2.54


Height_cm = [_height_to_cm(height) for height in df['Height'].values]
print(len(df))
df['Height_cm'] = Height_cm
print(df['Height_cm'].head())

print("Mean Height (cm): ", df['Height_cm'].mean())
print("Standard Deviation in Height (cm): ", df['Height_cm'].std())

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()
plt.title("Height Histogram")
df['Height_cm'].hist(bins=10)
# Render the histogram now; without this the next plot in the script is drawn
# on top of it because both would share the current figure.
plt.show()

# Drop every row that has a missing value in any of the retained columns.
df.dropna(inplace=True)

# Take the leading number of each weight entry instead of a fixed three
# characters, so two-digit or decimal weights parse as well. The divisor
# 2.20462 converts pounds to kilograms.
# NOTE(review): assumes weights are strings such as "159lbs" -- confirm.
df['Weight_kg'] = (
    df['Weight'].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float) / 2.20462
)
print(df.head())
def get_statistics(numeric_column_name, bins=10):
    """Print mean and standard deviation of a column and show its histogram.

    Reads the column from the module-level ``df``.

    Args:
        numeric_column_name: Name of a numeric column in ``df``.
        bins: Number of histogram bins. Defaults to 10, the value that was
            previously hard-coded.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    column = df[numeric_column_name]
    print("Mean {}: ".format(numeric_column_name), column.mean())
    print("Standard Deviation in {}: ".format(numeric_column_name), column.std())

    sns.set()
    plt.title("{} Histogram".format(numeric_column_name))
    column.hist(bins=bins)
    # Show the figure here, as plot_most_common does; otherwise the histogram
    # stays on the current axes and the caller's next plot is drawn over it.
    plt.show()
# Summary statistics and histogram for the converted weight column.
get_statistics('Weight_kg')

from collections import Counter

# Tally nationalities once and reuse the tally for the printout and the chart.
nationality_counts = Counter(df['Nationality'].values)
print(dict(nationality_counts.most_common(10)))

# Bar chart of the five most frequent nationalities.
bar_plot = dict(nationality_counts.most_common(5))
plt.bar(*zip(*bar_plot.items()))
plt.show()
def plot_most_common(category, n=5):
    """Show a bar chart of the most frequent values in a column.

    Reads the column from the module-level ``df``.

    Args:
        category: Name of the column in ``df`` whose values are counted.
        n: How many of the most frequent values to plot. Defaults to 5, the
            value that was previously hard-coded.
    """
    top_counts = Counter(df[category].values).most_common(n)
    labels = [label for label, _ in top_counts]
    heights = [count for _, count in top_counts]
    # Passing explicit sequences keeps the call valid when the column is
    # empty; unpacking zip(*...) of nothing would call plt.bar() with no
    # arguments and raise TypeError.
    plt.bar(labels, heights)
    plt.show()
plot_most_common('Position')

df['Age'] = df['Age'].astype(int)

# Scale factor for the magnitude suffix that may end a money string.
_MONEY_SUFFIX_MULTIPLIERS = {'K': 1e3, 'M': 1e6}


def _money_to_number(text):
    """Convert one raw money entry to a plain float amount.

    A single leading non-digit character (the currency symbol) is dropped,
    and a trailing K or M scales the number by 1e3 or 1e6. An entry without
    a suffix is taken at face value. Returns NaN when the remaining text is
    not a number.

    NOTE(review): presumably entries look like <symbol><number><K|M>, which
    matches the 1e3 / 1e6 factors the original code applied -- confirm
    against data.csv.
    """
    amount = str(text).strip()
    if amount and not amount[0].isdigit():
        amount = amount[1:]

    multiplier = 1.0
    suffix = amount[-1:].upper()
    if suffix in _MONEY_SUFFIX_MULTIPLIERS:
        multiplier = _MONEY_SUFFIX_MULTIPLIERS[suffix]
        amount = amount[:-1]

    try:
        return float(amount) * multiplier
    except ValueError:
        return np.nan


# The previous code dropped the first and last character of every entry and
# then always scaled wages by 1e3 and values by 1e6. A value given in
# thousands was therefore inflated a thousandfold, and an un-suffixed entry
# lost its last digit (a bare zero became NaN). Parsing the suffix per entry
# fixes both. The 'Wage' and 'Value' columns now keep their raw strings
# instead of being overwritten with the stripped text.
wage_list = [_money_to_number(wage) for wage in df['Wage'].values]
value_list = [_money_to_number(value) for value in df['Value'].values]

df['Wage_numeric'] = wage_list
df['Value_numeric'] = value_list
# Numeric columns that feed the correlation matrix.
numerical_columns = df[['Height_cm', 'Weight_kg', 'Value_numeric', 'Age', 'Wage_numeric']]
print(numerical_columns.head())

import seaborn as sns

sns.set()

# Annotated heatmap of the pairwise correlations between the numeric columns.
correlation_matrix = numerical_columns.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

# Narrow the frame to three nationalities and compare their height distributions.
selected_nations = ['England', 'Germany', 'Spain']
nation_mask = df['Nationality'].isin(selected_nations)
df = df[nation_mask]
sns.boxplot(x=df['Nationality'], y=df['Height_cm'])
plt.show()