# Exercício

Procure uma base de dados em um dos seguintes sites: 
https://www.kaggle.com/datasets ou
https://archive.ics.uci.edu/ml/index.php

A base escolhida deve ter pelo menos 5000 linhas e 10 colunas. Em seguida, usando sua criatividade e os conceitos aprendidos nos dois notebooks Jupyter anteriores sobre pandas, realize operações interessantes na base de dados escolhida.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

In [None]:
# Location of the CSV inside the Kaggle input directory.
arquivo = '/kaggle/input/fifa19eda/fifa_eda.csv'
# Comma-separated file whose first row holds the column names.
dataset = pd.read_csv(filepath_or_buffer=arquivo, header=0, sep=',')
# (rows, columns) of the loaded table.
dataset.shape

In [None]:
# Show the Python type of the object returned by pd.read_csv.
type(dataset)

In [None]:
# Preview the first five rows.
dataset.head(n=5)

# Treating Missing Values

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# Drop, in place, every row that has at least one missing value.
dataset.dropna(axis=0, how='any', inplace=True)

In [None]:
# Drop, in place, the rows in which every value is missing.
dataset.dropna(axis=0, how='all', inplace=True)

In [None]:
# Fill missing values column by column: -1 for 'International Reputation'
# and 'Skill Moves', 0 for 'Club', 'Value' and 'Contract Valid Until'.
#
# One DataFrame.fillna call with a per-column mapping replaces the chained
# `dataset[col].fillna(..., inplace=True)` form, which operates on a
# temporary Series and is deprecated in recent pandas releases.
#
# NOTE: the earlier `dataset.dropna(inplace=True)` cell already removes every
# row containing a missing value, so when the cells run in order there is
# nothing left to fill here.
dataset.fillna(
    {
        'International Reputation': -1,
        'Skill Moves': -1,
        'Club': 0,
        'Value': 0,
        'Contract Valid Until': 0,
    },
    inplace=True,
)

In [None]:
# Summary statistics of the columns after the missing-value handling above.
dataset.describe()

# Number of right-footed and left-footed players

In [None]:

# Count the players per preferred foot and draw the counts as a bar chart.
pe = dataset['Preferred Foot'].value_counts()
plot = pe.plot(kind='bar')

# Listing only Brazilians


In [None]:
# Rows whose 'Nationality' is exactly 'Brazil'.
dataset[dataset['Nationality'] == 'Brazil']

In [None]:
# Number of Brazilian players.
# len() of the filtered frame gives a single row count; the previous
# `.count()` returned one non-null count per column instead of one number.
len(dataset.loc[dataset['Nationality'] == 'Brazil'])

In [None]:
# Rows whose 'International Reputation' equals 5.
dataset[dataset['International Reputation'] == 5]


# Brazilian players' wages (in k)

In [None]:

# 'Wage' values of the Brazilian players, drawn as a 10-bin histogram.
salario = dataset.loc[dataset['Nationality'] == 'Brazil', 'Wage']
salario.plot(kind='hist', bins=10)

# 10 most common ages

In [None]:

import matplotlib
%matplotlib inline
idades =  dataset['Age'].value_counts()[:10]

plot = idades.plot.bar()

# 10 most common nationalities

In [None]:

# value_counts() sorts by frequency, so the first ten entries are the ten
# most frequent nationalities.
pais = dataset['Nationality'].value_counts().head(10)

plot = pais.plot(kind='bar')


# Listing players with potential greater than 90

In [None]:
# Keep the rows with 'Potential' strictly above 90, then show their names.
potencial = dataset[dataset['Potential'].gt(90)]
potencial['Name']


# Brazilians with potential of 90 or greater

In [None]:
# Brazilian players whose 'Potential' is 90 or more; show their names.
is_brazilian = dataset['Nationality'] == 'Brazil'
has_high_potential = dataset['Potential'] >= 90
br = dataset[is_brazilian & has_high_potential]
br['Name']

In [None]:
%matplotlib inline
dataset.plot(x='Wage',y='Potential',kind='scatter', title='Salário x Potencial',color='r')

In [None]:
# Summary statistics of the columns (same call as earlier in the notebook).
dataset.describe()

In [None]:
# Scatter plot of 'Potential' (x axis) against 'Skill Moves' (y axis), in red.
dataset.plot.scatter(x='Potential', y='Skill Moves', title='Potential x Skill Moves', color='r')

In [None]:
# Mean of the 'Wage' column.
dataset['Wage'].mean()

In [None]:
# Standard deviation of the ages.
# ddof=0 is what np.std uses, i.e. the population standard deviation.
dataset['Age'].std(ddof=0)

In [None]:
# Mean of the boolean mask, i.e. the fraction of rows with 'Brazil' nationality.
(dataset['Nationality'] == 'Brazil').mean()

In [None]:
# Scatter plot of 'Wage' (x axis) against 'Release Clause' (y axis), in cyan.
dataset.plot.scatter(x='Wage', y='Release Clause', title='Wage x Release Clause', color='c')

In [None]:
# Scatter plot of 'Joined' (x axis) against 'Release Clause' (y axis), in cyan.
dataset.plot.scatter(x='Joined', y='Release Clause', title='Joined x Release CLause', color='c')

In [None]:
# Scatter plot of 'Joined' (x axis) against 'Wage' (y axis), in cyan.
dataset.plot.scatter(x='Joined', y='Wage', title='Joined x Wage', color='c')

# Creating X train and Y train

In [None]:
# List the column labels available for feature selection.
dataset.columns

In [None]:
# Pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because DataFrame.corr() no
# longer skips text columns by default in pandas >= 2.0 and raises when it
# meets them; selecting by dtype works on old and new pandas alike.
correlacoes = dataset.select_dtypes(include='number').corr()

In [None]:
# Correlation heat map, with each cell annotated with its value.
plt.figure(figsize=(15, 8))
sns.heatmap(correlacoes, annot=True, cmap="inferno", linewidths=0.5)
plt.show()

# Normalizing

In [None]:
# Feature matrix: the five predictor columns as a 2-D array.
feature_columns = ['Overall', 'Skill Moves', 'Potential', 'International Reputation', 'Value']
X = dataset[feature_columns].to_numpy()
# Target: 'Wage', kept as a single-column 2-D array.
y = dataset[['Wage']].to_numpy()

In [None]:

# Min-max scale every feature column: learn each column's range from X,
# then apply the transformation to X.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Linear Regression

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split, and therefore the metrics reported
# below, reproducible from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, Y_train)

In [None]:
# Predict the target for the held-out test rows and print the raw predictions.
preds = model.predict(X_test)
print(preds)

In [None]:
# Score the predictions against the held-out targets: mean squared error
# and coefficient of determination (R^2).
mse_error = mean_squared_error(y_true=Y_test, y_pred=preds)
r2_score_evaluate = r2_score(y_true=Y_test, y_pred=preds)

print('MSE:', mse_error)
print('R2:', r2_score_evaluate)