In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))


In [None]:
# Load the match results CSV into the DataFrame used by the following cells.
data = pd.read_csv('../input/international-football-results-from-1872-to-2017/results.csv')

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Pairwise (Pearson, the default) correlation between the numeric/boolean columns.
# The frame also holds string columns; recent pandas no longer drops them
# silently inside corr(), so restrict to numeric and bool dtypes explicitly.
data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Annotated heatmap of the correlation matrix.
# Only numeric/bool columns are correlated: calling corr() on the full frame
# fails on its string columns with recent pandas versions.
f, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(
    data.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,        # write the coefficient inside each cell
    linewidths=.5,
    fmt='.1f',         # one decimal place
    ax=ax,
)
plt.show()

In [None]:
# Preview the first 10 rows of the frame.
data.head(10)

In [None]:
# List the column labels of the frame.
data.columns

In [None]:
# Line Plot: home and away goals per row, drawn on the same axes.
data['home_score'].plot.line(
    color='g', label='Home Score', linewidth=1, alpha=0.5, grid=True, linestyle=':'
)
data['away_score'].plot(
    color='r', label='Away Score', linewidth=1, alpha=0.5, grid=True, linestyle='-.'
)
plt.legend(loc='upper right')
plt.xlabel('x axis')     # x-axis label
plt.ylabel('y axis')     # y-axis label
plt.title('Line Plot')   # figure title
plt.show()

In [None]:
# Scatter Plot: away goals against home goals, one point per match.
data.plot(kind='scatter', x='home_score', y='away_score', alpha=0.5, color='red')
plt.xlabel('home_score')
plt.ylabel('away_score')
plt.title('Home Score - Away Score ')
# Render the figure explicitly, like the other plotting cells; without this
# the cell's output is the Text object returned by plt.title().
plt.show()

In [None]:
# Histogram: distribution of home goals across 50 bins on a 12x12 inch figure.
data['home_score'].plot.hist(bins=50, figsize=(12, 12))
plt.show()

In [None]:
# Boolean mask: rows where the home side scored more than five goals.
x = data['home_score'].gt(5)
data.loc[x]

In [None]:
# Boolean mask: rows where the away side scored more than five goals.
y = data['away_score'].gt(5)
data.loc[y]

In [None]:
# Rows where both sides scored more than three goals (element-wise AND of two masks).
data.loc[data['home_score'].gt(3) & data['away_score'].gt(3)]

In [None]:
data.head() # head() with no argument returns the first 5 rows

In [None]:
data.tail() # tail() with no argument returns the last 5 rows

In [None]:
# .columns holds the column labels (the feature names) of the frame
data.columns

In [None]:
# .shape gives the number of rows and columns as a tuple: (n_rows, n_columns)
data.shape

In [None]:
# info() reports the object type (DataFrame), the number of rows (samples),
# and for every column (feature) its dtype and non-null count
data.info()

In [None]:
# value_counts(): frequency of each distinct value in the column.
# dropna must be the boolean False so that NaN entries are counted as well;
# the string "False" is truthy and would silently drop them.
print(data['home_team'].value_counts(dropna=False))

In [None]:
# Frequency of each away team; dropna=False (a real boolean, not the truthy
# string "False") keeps NaN entries in the count.
print(data['away_team'].value_counts(dropna=False))

In [None]:
# Frequency of each host city; dropna=False (a real boolean, not the truthy
# string "False") keeps NaN entries in the count.
print(data['city'].value_counts(dropna=False))

In [None]:
# describe() summarises the numeric columns. Reading the output:
# outliers: values considerably higher or lower than the rest of the data
# count: number of non-null entries
# mean: average of the entries
# std: standard deviation
# min: minimum entry
# 25%: first quartile
# 50%: median (second quartile)
# 75%: third quartile
# max: maximum entry
data.describe()

In [None]:
# Box plots visualise basic statistics such as outliers, min/max and quartiles.
# One box of home_score is drawn per distinct value of the 'neutral' column.
data.boxplot(column='home_score', by='neutral')
plt.show()

In [None]:
# Tidy Data - melt()
# Take the first five rows as a small sample to demonstrate melting on.
data_new = data.head()
data_new

In [None]:
# Melting: reshaping the data into a different (long) structure.
# home_team is kept as the identifier; away_team and home_score are unpivoted
# into 'variable' / 'value' rows.
melted = pd.melt(frame = data_new, id_vars = 'home_team', value_vars = ['away_team', 'home_score'])
print(melted)

In [None]:
# Concatenating Data - two DataFrames can be concatenated
# Stack the first five and last five rows vertically (axis=0);
# ignore_index=True renumbers the result 0..n-1 instead of keeping old labels.
data1 = data.head()
data2 = data.tail()
conc_data_row = pd.concat([data1, data2], axis = 0, ignore_index = True)
conc_data_row

In [None]:
# Column-wise concatenation (axis=1): place the first five home and away
# scores side by side. Note that data1/data2 are rebound to Series here.
data1 = data['home_score'].head()
data2 = data['away_score'].head()
conc_data_col = pd.concat([data1, data2], axis = 1)
conc_data_col

In [None]:
# Data Types: the dtype of every column
data.dtypes

In [None]:
# Convert home_team to a categorical column and away_score to float.
# These assignments modify `data` itself, so the new dtypes remain in effect
# for later cells until `data` is loaded from the CSV again.
data['home_team'] = data['home_team'].astype('category')
data['away_score'] = data['away_score'].astype('float')
data.dtypes

In [None]:
# Building Data Frame From Scratch
# Data Frames From Dictionary: pair each column label with its list of
# values, turn the pairs into a dict, and hand the dict to DataFrame.
country = ["Spain", "France"]
population = ["11", "12"]
list_label = ["country", "population"]
list_col = [country, population]
zipped = [(label, column) for label, column in zip(list_label, list_col)]
data_dict = {label: column for label, column in zipped}
df = pd.DataFrame(data_dict)
df

In [None]:
# Add new columns: assigning a list to a new label appends a column
# (the list needs one entry per existing row).
df["capital"] = ["madrid", "paris"]
df

In [None]:
# Broadcasting: assigning a scalar fills every row of the column with it
df["income"] = 0 # the single value 0 is broadcast to the entire column
df

In [None]:
# Visual Exploratory Data Analysis
# Plotting all data: both score columns as lines on a single set of axes
data1 = data.loc[:, ["home_score", "away_score"]]
data1.plot()
# the two overlapping lines are hard to read; the next cell separates them

In [None]:
# subplots=True draws each column of data1 on its own axes.
data1.plot(subplots = True)
plt.show()

In [None]:
# Scatter plot: away goals against home goals
data1.plot.scatter(x="home_score", y="away_score")
plt.show()

In [None]:
# Hist plot: raw counts of home_score in 50 bins restricted to the range 0-5.
# `density` replaces the `normed` keyword, which matplotlib has removed;
# density=False keeps the y-axis as counts rather than a normalised density.
data1.plot(kind = "hist", y="home_score", bins = 50, range=(0,5), density = False)
plt.show()

In [None]:
# Indexing Pandas Time Series
time_list = ["1992-03-08", "1992-04-12"]
print(type(time_list[1])) # the raw entries are plain strings
# pd.to_datetime parses the list of date strings into a pandas datetime index
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object)) 

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
# It is kept so later cells behave as before, but it is no longer needed
# by this cell now that the sample is an explicit copy.
warnings.filterwarnings("ignore")
# Copy the first five rows so the column assignment below writes to an
# independent frame rather than to a slice of `data`.
data2 = data.head().copy()
date_list = ["1992-01-10", "1992-02-10", "1992-03-10", "1993-03-15", "1993-03-16"]
datetime_object = pd.to_datetime(date_list)
# Set the 'date' column to the parsed dates, then use it as the row index so
# the frame can be indexed and resampled by time.
data2["date"] = datetime_object
data2 = data2.set_index("date")
data2

In [None]:
# Label-based selection on the date index: first a single date, then the
# slice from 1992-03-10 to 1993-03-16 (both end labels are included).
print(data2.loc["1993-03-16"])
print(data2.loc["1992-03-10":"1993-03-16"])

In [None]:
# Resampling Pandas Time Series
# "A" groups the rows by year; mean() averages each group.
# Only numeric/bool columns are averaged: the frame also carries string
# columns, on which mean() fails with recent pandas versions.
data2.select_dtypes(include=["number", "bool"]).resample("A").mean()

In [None]:
# "M" groups the rows by month; months without rows yield NaN means.
# Only numeric/bool columns are averaged: the frame also carries string
# columns, on which mean() fails with recent pandas versions.
data2.select_dtypes(include=["number", "bool"]).resample("M").mean()

In [None]:
# INDEXING DATA FRAMES
# Reload the CSV so this section starts from the file contents rather than
# from the frame modified by earlier cells (e.g. the dtype conversions).
data = pd.read_csv('../input/international-football-results-from-1872-to-2017/results.csv')
data.head()

In [None]:
# indexing using square brackets: select the column, then the row labelled 1
data["home_score"][1]

In [None]:
# using column attribute access and the row label: same value as above
data.home_score[1]

In [None]:
# using the loc accessor: row label first, then column label
data.loc[1,"home_score"]

In [None]:
# selecting only some columns: a list of labels returns a DataFrame
data[["home_score","away_score"]]

In [None]:
# SLICING DATA FRAME
# Difference between the two ways of selecting a column:
# single brackets give a Series, double brackets give a DataFrame
print(type(data['home_score'])) # Series
print(type(data[['home_score']])) # DataFrame

In [None]:
# slicing with loc: rows labelled 1 through 10 and the columns from
# home_score through away_score (loc slices include both end labels)
data.loc[1:10,"home_score":"away_score"] 

In [None]:
# reverse slicing: a step of -1 walks the row labels from 10 down to 1
data.loc[10:1:-1,"home_score":"away_score"] 

In [None]:
# open-ended slice: every column from "tournament" to the last one
data.loc[1:10,"tournament":]

In [None]:
# FILTERING DATA FRAMES
# Build a boolean Series (True where the home side scored more than 20)
# and use it to keep only the matching rows.
boolean = data['home_score'].gt(20)
data.loc[boolean]

In [None]:
# combining filters: a row is kept only when both masks are True
first_filter = data['home_score'].gt(5)
second_filter = data['away_score'].gt(3)
data.loc[first_filter & second_filter]

In [None]:
# filtering one column by a condition on another:
# home scores of the matches in which the away side scored more than 15
data.home_score[data.away_score>15]

In [None]:
# TRANSFORMING DATA
def div(n):
    """Return half of ``n``, computed with true division (``n / 2``)."""
    half = n / 2
    return half
# Apply div to every home_score value; returns a new Series and leaves
# `data` unchanged.
data.home_score.apply(div)

In [None]:
# or use an anonymous lambda function for the same element-wise halving
data.home_score.apply(lambda n : n/2)

In [None]:
# defining a column from other columns: element-wise sum of both scores.
# This adds the total_score column to `data` itself.
data["total_score"] = data.home_score + data.away_score
data.head()

In [None]:
# INDEX OBJECTS AND LABELED DATA
# Show the current name of the row index, then give it an explicit name;
# the new name appears above the index in the displayed frame.
print(data.index.name)
data.index.name = "index_name"
data.head()

In [None]:
# overwrite index
# Work on a copy (data3) so the index of `data` stays untouched.
data3 = data.copy()
# Make the index start from 100. It is not a remarkable change, just an example.
# The stop value is derived from the row count, so the new index always has
# exactly one label per row whatever the size of the loaded file.
data3.index = range(100, 100 + len(data3))
data3.head()

In [None]:
### HIERARCHICAL INDEXING
# Reload the CSV so this section starts from the file contents rather than
# from the frame modified by earlier cells.
data = pd.read_csv('../input/international-football-results-from-1872-to-2017/results.csv')
# Use country (outer level) and city (inner level) as a two-level row index,
# then show the last 20 rows.
data1 = data.set_index(["country","city"])
data1.tail(20)

In [None]:
# PIVOTING DATA FRAMES
# Small hand-made frame used by the pivot / stack / melt / groupby examples.
dic = dict(
    treatment=["A", "A", "B", "B"],
    gender=["F", "M", "F", "M"],
    response=[10, 45, 5, 9],
    age=[15, 4, 72, 65],
)
df = pd.DataFrame(data=dic)
df

In [None]:
 # pivoting
# One row per treatment, one column per gender, cells filled from response.
df.pivot(index="treatment", columns="gender",values="response")

In [None]:
# STACKING AND UNSTACKING DATAFRAME
# Build a two-level row index: treatment is level 0, gender is level 1.
df1 = df.set_index(["treatment","gender"])
df1

In [None]:
# Move index level 0 (treatment) from the rows into the columns.
df1.unstack(level=0)

In [None]:
# Move index level 1 (gender) from the rows into the columns.
# This must operate on df1, the frame indexed by (treatment, gender); the
# flat-indexed df has no level 1 to unstack.
df1.unstack(level=1)

In [None]:
# Swap the two index levels so gender becomes level 0 and treatment level 1.
df2 = df1.swaplevel(0,1)
df2

In [None]:
# MELTING DATA FRAMES
# The reverse of pivoting: treatment is kept as the identifier while the age
# and response columns are unpivoted into 'variable' / 'value' rows.
pd.melt(df,id_vars="treatment",value_vars=["age","response"])

In [None]:
# CATEGORICALS AND GROUPBY
# Mean of every numeric column per treatment group.
# numeric_only=True skips the string column `gender`, which cannot be
# averaged and makes mean() fail with recent pandas versions.
df.groupby("treatment").mean(numeric_only=True)

In [None]:
# Mean age per treatment group (a single column is selected before aggregating).
df.groupby("treatment").age.mean()

In [None]:
# Minimum age and response per treatment group.
df.groupby("treatment")[["age","response"]].min()

In [None]:
# Summarise the small demo frame: dtypes, non-null counts and memory usage.
df.info()