In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
from datetime import datetime #datetime module

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.Introduction to Python

## Reading Data


In [None]:
df = pd.read_csv("/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv")
df.head(10) # first 10 rows

In [None]:
df.info() # summary of the DataFrame: columns, non-null counts and dtypes

In [None]:
df.tail() # default -> last 5 rows

In [None]:
df.columns # names of data columns

In [None]:
df.count() # data count in columns

In [None]:
df['Province/State'].value_counts(sort=True,ascending=True) # number of rows per Province/State value, sorted by that count in ascending order

In [None]:
df.size # returns size of dataframe which is equivalent to total number of elements. That is rows x columns.

## Correlation Map

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly keeps
# this cell working on pandas >= 2.0, where DataFrame.corr() raises on string
# columns instead of silently dropping them.
numeric_corr = df.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(12, 12))
# `linewidths` is the documented seaborn heatmap parameter for the cell borders.
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.1f', cbar=True, ax=ax)
plt.show()

## Matplotlib

**LINE PLOT**

A line plot works best when the x-axis represents time.

In [None]:
# Plot the Deaths column against the row index as a dashed, semi-transparent red line.
df.Deaths.plot(kind = 'line', color = 'red',label ='Deaths',linewidth=1,alpha = 0.5,grid = True,linestyle = '--')
plt.legend(loc='upper right') # shows the 'Deaths' label in a legend at the upper right
plt.xlabel('x axis')    # label of the x axis
plt.ylabel('y axis')    # label of the y axis
plt.title('Line Plot')  # title of plot
plt.show()

**SCATTER PLOT**

A scatter plot works best for showing the relationship (correlation) between two variables.

In [None]:
df.plot(kind='scatter', x="Deaths", y='Recovered', alpha=0.4, color='blue')
plt.xlabel('Deaths') # name of xlabel
plt.ylabel('Recovered') # name of ylabel
plt.title('Deaths Recovered Scatter Plot') #title of plot
plt.show()



**HISTOGRAM**

A histogram works best when we need to see the distribution of numerical data.

In [None]:
df.Deaths.plot(kind = 'hist',bins = 100,figsize = (15,15))
plt.title("Histogram")
plt.show()

**DATE PLOT**

In [None]:
# Recovered counts plotted against ObservationDate as a dashed green line.
df.plot(x='ObservationDate',y='Recovered',color = 'green',label ='Recovered',linewidth=1,alpha = 0.5,grid = True,linestyle = '--')
plt.title('', color='black')
plt.xticks(rotation = 90) # rotates the labels 90 degrees.
#plt.tight_layout() 
# plt.show() is the public pyplot entry point; plt._show is a private name that is
# not part of the documented API and is not guaranteed to exist.
plt.show()

In [None]:
#Other date plot
'''df['ObservationDate'] = df['ObservationDate'].map(lambda x: datetime.strptime(str(x), '%m/%d/%Y'))
x = df['ObservationDate']
y = df['Recovered']

plt.plot(x,y,color='pink')# plot
plt.gcf().autofmt_xdate()# beautify the x-labels
plt.show()'''

## Dictionary

In [None]:
# Currency name -> currency code. Pairs are inserted in the order listed.
currency = dict([
    ("Dolar", "USD"),
    ("Türk Lirası", "TR"),
    ("Euro", "EUR"),
    ("Sterlin", "GBP"),
])
print('My dictionary :',currency)
print(currency.keys())    # view of the names
print(currency.values())  # view of the codes

In [None]:
x = currency.get('Euro') # get 'Euro''s value
print(x)

In [None]:
for k,v in currency.items(): # print key and value in dictionary
    print(k+' : '+v)

In [None]:
if "Sterlin" in currency: # check 'Sterlin' in dictionary
  print("Yes, 'Sterlin' is one of the keys in the currency dictionary")

In [None]:
currency['Kanada Doları']='CAD' # adding item
print(currency)

In [None]:
#currency.clear() #remove dictionary
#del currency # delete dictionary 

## Pandas

In [None]:
df[:8] # first 8 rows (positions 0-7; the slice end is exclusive)

In [None]:
#Filtering
x = df['Deaths']>5000 
df[x]

In [None]:
df[(df['Recovered']>5000) & (df['Deaths']<500)]

In [None]:
# Total the numeric columns per (ObservationDate, Country/Region) pair and keep only
# the groups with more than 4000 deaths. ObservationDate is grouped as the raw
# %m/%d/%Y string. numeric_only=True stops sum() from trying to add up string
# columns such as Province/State.
(df.groupby(['ObservationDate','Country/Region']).sum(numeric_only=True).loc[lambda totals: totals['Deaths'] > 4000])

In [None]:
# Draw 6 rows at random, each row weighted by its Deaths value.
# A fixed random_state makes the sample identical on every Restart & Run All.
df.sample(n=6, weights='Deaths', random_state=42)

## While and For Loops

In [None]:
i = 0

# Print six numbered lines; the loop ends once the counter reaches 6.
while i != 6:
    print(i,"Data Science")
    i = i + 1

In [None]:
#print columns names with while loop
i=0
while i<len(df.columns):
    print("Column",i, ':' ,df.columns[i])
    i +=1


In [None]:
#print columns names with for loop
for col in df.columns:
    print(col)

In [None]:
#For pandas we can achieve index and value
for index,value in df[['Province/State']][0:5].iterrows():
    print(index," : ",value)

In [None]:
# iterate over rows with iterrows()
for index, row in df.head(6).iterrows():
     # access data using column names
     print(index+1, row['Province/State'], row['Country/Region'])

# 2.Python Data Science Toolbox

## User Defined Function

In [None]:
#Create Tuple
x = ("C","Java","Python")

print(x)

In [None]:
# Tuples are immutable, so build a list with the first item swapped and convert it back.
x = ("C","Java","Python")
y = ["C++", *x[1:]]
x = tuple(y)

print(x)

In [None]:
#Values of tuple with for loop
for i in x:
    print(i)

print("\n")
(l1,l2,l3)=x
print("Values:",l1,l2,l3)

## Scope

In [None]:
x = 2 #Global scope

def f(y):
    """Return ``y`` raised to the power of the module-level ``x``."""
    return pow(y, x)

print(f(5))

In [None]:
count = 1  # module-level (global) name

def func():
    """Return the value held by the function-local ``count`` after six loop passes."""
    count = 0
    for step in range(6):
        count = step + 1  # rebinds the local name only; the global count is untouched
    return count

print(count) # count = 1 global scope
print(func()) # count = 6 local scope


In [None]:
# How can we learn what is built in scope
import builtins
dir(builtins)

## Nested Function

In [None]:
def function1(): # outer function
    """Print a greeting, then call a nested function that prints its own greeting."""
    def function2(): # inner function, visible only inside function1
        print ("Hello from inner function")

    print ("Hello from outer function")
    function2()

function1()

In [None]:
# example finding number's square with nested function
def func1(x):
    """Return ``x`` squared, computed by an inner function that closes over ``x``."""
    def func2():
        return pow(x, 2)

    squared = func2()
    return squared

#number = int(input("Please enter a number ")) with input value -> func1(number)
print(f"{5}'s square =",func1(5))

## Default and Flexible Arguments

In [None]:
# default argument
def func(lang='Python'):
    """Return ``lang`` unchanged; it is ``'Python'`` when the caller passes nothing."""
    chosen = lang
    return chosen

# first call relies on the default, second call overrides it
for selected in (func(), func('C++')):
    print("Programing language:", selected)

In [None]:
# flexible arguments *args
def func(*args):
    """Print every positional argument on its own line; print nothing when there are none."""
    if args:
        print(*args, sep="\n")

func(3,5,8,11)
list1 = list(range(1, 7))
func(list1)  # the list is one argument, so it is printed as a single line
list2 = (list(range(1, 7)), [2,5,7,8,9,10])
func(list2)  # likewise, the tuple of two lists is one argument

In [None]:
# flexible arguments **kwargs that is dictionary
def func(**kwargs):
    """Print each keyword argument as ``name -> value``, one per line."""
    for key in kwargs:
        print(f"{key} -> {kwargs[key]}")

func(Class = 'Data Science', Part = '2')
        

## Lambda Function

In [None]:
# lambda function
# A lambda is a single-expression anonymous function; binding it to a name lets it be called like a regular function.
negative = lambda x : -x  # takes one argument and returns its negation
print(negative(5))

result = lambda a,b,c : a*b*c  # takes three arguments and returns their product
print(result(2,5,10))

## Anonymous Function

In [None]:
import math # for sqrt function

num_list = [0, 4, 16, 36]
# map() is lazy: the anonymous function runs on each element only when tuple() consumes the map object.
func = map(lambda value: math.sqrt(value), num_list)
print(tuple(func))

## Iterators

In [None]:
# iteration example
subject = "DataScience"
it=iter(subject)
# pull the first four characters one at a time, each printed on its own line
print(*(next(it) for _ in range(4)), sep="\n")
# unpack whatever the iterator still holds onto a single line
print(*it)

In [None]:
# zip example zip() -> pair up the items of two lists position by position
list1=["Dolar","Türk Lirası","Euro","Sterlin"]
list2=["USD","TR","EUR","GBP"]
z = tuple((name, code) for name, code in zip(list1, list2))
print(z)

In [None]:
un_zip = zip(*z)
un_list1,un_list2 = tuple(un_zip)
print(un_list1)
print(un_list2)

## List Comprehension

In [None]:
list1 = [1,2,5,8,14,21]
# Build the labels with the comprehension and print them in a separate loop.
# Calling print() inside the comprehension only filled list2 with None values.
list2 = [f"{i}: Odd" if i%2!=0 else f"{i}: Even" for i in list1]
for label in list2:
    print(label)

In [None]:
# Label every row of the covid_19_data dataset by comparing its Deaths to the column average.
# Series.mean() is vectorized and skips NaN, whereas the built-in sum()/len()
# iterates in Python and returns NaN as soon as one value is missing.
threshold = df.Deaths.mean() # average Deaths
print("Average Deaths:",threshold) 
# Vectorized replacement for a per-row comprehension: "High" above the average,
# "Low" otherwise (NaN compares False and therefore becomes "Low").
df["Deaths_Level"] = np.where(df.Deaths > threshold, "High", "Low")
# Show rows labelled 60000 through 80000 with their deaths level (high or low)
df.loc[60000:80000,["Deaths_Level","Deaths","Country/Region"]] 

## **Result**<br>
As a result, this table shows a high deaths level in the US, the UK and Italy compared to other countries (because of Covid-19).

# 3.Cleaning Data

## Diagnose Data for Cleaning 

In [None]:
df.head(15) #first 15 data

In [None]:
df.tail(15) #last 15 data

In these tables, we can see the original data together with the Deaths_Level column we added in part 2.

In [None]:
df.shape # (row,column)

In [None]:
df.info() # information about datas

## Exploratory Data Analysis (EDA)

In [None]:
# For example lets look frequency of countries
print(df['Country/Region'].value_counts(dropna =False))

In [None]:
df.describe() # There may be problems with statistics account because there are too many 0 values.

In [None]:
df.mask(df == 0).describe() # 0 values are masked.

## Visual Exploratory Data Analysis

In [None]:
# For example: compare deaths covid-19 that are deaths_level high or not
# Black line at top is max
# Blue line at top is 75%
# Green line is median (50%)
# Blue line at bottom is 25%
# Black line at bottom is min
# There are no outliers
df.boxplot(figsize=(15,15),column='Deaths',by = 'Deaths_Level') # There may be problems because there are too many 0 values and data.

## Tidy Data

In [None]:
# Firstly I create new data from covid_19 data to explain melt more easily.
data_new = df.loc[55591:55601]    # .loc slices by label and includes both endpoints, so this selects 11 rows when every label in the range exists
data_new

In [None]:
# lets melt
# id_vars = what we do not wish to melt
# value_vars = what we want to melt
melted = pd.melt(frame=data_new,id_vars = 'Country/Region', value_vars= ['Deaths','Recovered'])
melted

In this table, we can see the deaths and recovered numbers by country/region.

## Pivoting Data

In [None]:
# Index is name
# I want to make that columns are variable
# Finally values in columns are value
melted.pivot_table(index = 'Country/Region', columns = 'variable',values='value')

## Concatenating Data

In [None]:
data1 = df.head()
data2= df.tail()
conc_data_row = pd.concat([data1,data2],axis =0,ignore_index =True) # axis = 0 : adds dataframes in row
conc_data_row # Concatenating data1 and data2 

In [None]:
data1 = df['Country/Region'].tail()
data2= df['Deaths'].tail()
data3= df['Recovered'].tail()
conc_data_col = pd.concat([data1,data2,data3],axis =1) # axis = 1 : adds dataframes in column
conc_data_col

## Data Types 

In [None]:
df.dtypes

In [None]:
# lets convert object(str) to categorical and float to int.
df['Deaths_Level'] = df['Deaths_Level'].astype('category')
df['Confirmed'] = df['Confirmed'].astype('int')

df.dtypes

## Missing Data and Testing With Assert 

In [None]:
# Lets look at does covid_19 data have nan value
# As you can see there are 68558 entries. However Province/State has 44125 non-null object so it has 24433 null object.
df.info()

In [None]:
# Lets chech Province/State
df["Province/State"].value_counts(dropna =False)
# As you can see, there are 24433 NAN value

In [None]:
# Lets drop nan values
data1=df   # also we will use df to fill missing value so I assign it to data1 variable
data1["Province/State"].dropna(inplace = True)  # inplace = True means we do not assign it to new variable. Changes automatically assigned to data
# So does it work ?

In [None]:
# Lets check with assert statement
# Assert statement:
assert  df["Province/State"].notnull().all() # returns nothing because we drop nan values

In [None]:
# Assign the filled column back to df. fillna(inplace=True) on a column selection
# works on a temporary object and leaves df unchanged under pandas Copy-on-Write.
df["Province/State"] = df["Province/State"].fillna('empty')

In [None]:
assert  df["Province/State"].notnull().all() # returns nothing because we do not have nan values

In [None]:
# # With assert statement we can check a lot of thing. For example
assert df.columns[0] == 'SNo' #True
#assert df.Deaths_Level.dtypes == np.int #False

In [None]:
df["Province/State"].value_counts(dropna = False) # now there isn't nan value in table

# 4.Pandas Foundation

## Building Data Frames From Scratch

In [None]:
# data frames from dictionary
language = ["English","German","Turkish"]
level = ["B2","B1","C2"]
list_label = ["language","level"]
list_col = [language,level]
# pair every column label with its list of values, then turn the pairs into a dict
zipped = [(label, column) for label, column in zip(list_label, list_col)]
data_dict = {label: column for label, column in zipped}
print(data_dict)
# each dict key becomes a column of the frame
data = pd.DataFrame(data_dict)
data

In [None]:
# Add new columns with list comprehension 
data["completed"]=["Yes" if i == "C2"  else "No" for i in data.level]
data

In [None]:
# Broadcasting
data["necessary"] = "Yes" #Broadcasting entire column
data

## Visual Exploratory Data Analysis

In [None]:
# Plotting all data 
data1 = df.loc[:,["Confirmed","Deaths","Recovered"]]
data1.plot()
# it is confusing

In [None]:
# subplots
data1.plot(subplots = True)
plt.show()

In [None]:
# scatter plot  
data1.plot(kind = "scatter",x="Confirmed",y = "Deaths",alpha=0.4,color="red")
plt.show()

In [None]:
# histogram plot  
data1.plot(kind = "hist",y = "Deaths",bins = 30,range= (0,500),density = True) # density was used instead of normed

In [None]:
# histogram subplot with non cumulative and cumulative
fig, axes = plt.subplots(nrows=2,ncols=1)
# options shared by both panels, so the two histograms cannot drift apart
hist_options = dict(kind = "hist", y = "Deaths", bins = 50, range = (0,500), density = True, color = "yellow")
data1.plot(ax = axes[0], **hist_options)
data1.plot(ax = axes[1], cumulative = True, **hist_options) #cumulative total
# save before showing: the figure is written to disk while it is still open
plt.savefig('graph.png')
# a bare `plt` only displays the module repr; plt.show() renders the figure
plt.show()

## Statistical Exploratory Data Analysis

In [None]:
df.describe()

## Indexing Pandas Time Series

In [None]:
time_list = ["2020-11-29","2020-07-30"]
print(type(time_list[1])) # As you can see date is string
# however we want it to be datetime object
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))

In [None]:
# close warning
# NOTE(review): with no category given, this silences every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# In order to practice lets take covid-19 data and add it a time list
datetime_object = pd.to_datetime(df.ObservationDate) # convert ObservationDate (object dtype) to pandas datetimes
# assign() returns a new frame, so df does not gain a "date" column as a side
# effect (the previous `data2 = df` was only an alias of df).
# lets make date as index
data2 = df.assign(date=datetime_object).set_index("date")
data2.head(10)

In [None]:
# Now we can select according to our date index
print(data2.loc["2020-06-11"])
print(data2.loc["2020-06-11":"2020-07-30"])

In this output, we can see the results for 11.06.2020 and for the period between 11.06.2020 and 30.07.2020.

## Resampling Pandas Time Series 

In [None]:
# average covid-19 results by months
# Only numeric columns can be averaged; selecting them first avoids the TypeError
# that pandas >= 2.0 raises when mean() meets string columns.
data2.select_dtypes(include="number").resample("M").mean()

In [None]:
#data2.resample("M").first().interpolate("linear")
#data2.resample("M").mean().interpolate("linear")
# we didn't use interpolate because data already include all months

# 5.Manipulating Data Frames with Pandas 

## Indexing Data Frames 

In [None]:
# read data
data = pd.read_csv("/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv")
data= data.set_index("SNo")
data.head()

In [None]:
# indexing using square brackets
data["Country/Region"][55543]

In [None]:
# using column attribute and row label
data.Deaths[55543]

In [None]:
# using loc accessor
data.loc[1,["Country/Region"]]

In [None]:
# selecting only some columns
data[["Country/Region","Deaths","Recovered"]]

## Slicing Data Frame

In [None]:
# difference between selecting columns: series and dataframes
print(type(data["Confirmed"]))     # series
print(type(data[["Confirmed"]]))   # data frames

In [None]:
# slicing and indexing series
data.loc[55000:55010,"Country/Region":"Deaths"]   # 10 and "Deaths" are inclusive

In [None]:
# reverse slicing 
data.loc[55010:55000:-1,"Country/Region":"Deaths"]

In [None]:
# from something to end
data.loc[55000:55010,"Deaths":] 

## Filtering Data Frames

In [None]:
# creating boolean series
boolean = data.Deaths > 40000
data[boolean]

In [None]:
# combining filters
first_filter = data.Deaths < 5000
second_filter = data.Recovered > 50000
data[first_filter & second_filter]

In [None]:
# filtering column based others
data.Deaths[data.Recovered>20000]

## Transforming Data

In [None]:
# plain python functions
def square(n):
    """Return ``n`` raised to the second power."""
    return pow(n, 2)
data.Confirmed.apply(square)

In [None]:
# or we can use lambda function
data.Confirmed.apply(lambda n : n**2)

In [None]:
# defining column using other columns
data["gap"] = data.Recovered - data.Deaths
data.loc[55000:55010]

## Index Objects and Labeled Data

In [None]:
# our index name is this:
print(data.index.name)
# lets change it
data.index.name = "Index"
data.head()

In [None]:
# Overwrite index
# if we want to modify index we need to change all of them.
# first copy of our data to data3 then change index 
data3 = data.copy()
# lets make index start from 100 and go up in steps of 2. It is not remarkable change but it is just example
# The new index must have exactly one label per row, so the stop value is derived
# from len(data3) rather than hard-coded for one version of the dataset.
data3.index = range(100, 100 + 2 * len(data3), 2)
data3.head()

In [None]:
# data= data.set_index("#") or  data.index = data["#"]

## Hierarchical Indexing 

In [None]:
# lets read data frame one more time to start from beginning
data = pd.read_csv("/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv")
data.head()
# As you can see there is index. However we want to set one or more column to be index

In [None]:
# Setting index : Country/Region is outer Province/State is inner index
data1 = data.set_index(["Country/Region","Province/State"]) 
data1.head(10000)

## Pivoting Data Frames

In [None]:
dic = {"job":["Engineer","Engineer","Chef","Chef"],"gender":["M","F","M","F",],"experience":[0,5,12,8],"age":[22,28,40,32]}
df = pd.DataFrame(dic)
df

In [None]:
# pivoting
df.pivot(index="job",columns = "gender",values="age")

## Stacking and Unstacking DataFrame

In [None]:
df1 = df.set_index(["job","gender"])
df1

In [None]:
# level determines indexes
df1.unstack(level=0)

In [None]:
df1.unstack(level=1)

In [None]:
# change inner and outer level index position
df2 = df1.swaplevel(0,1)
df2

## Melting Data Frames

In [None]:
df

In [None]:
pd.melt(df,id_vars="job",value_vars=["age","experience"])

## Categoricals and GroupBy

In [None]:
df

In [None]:
# according to job take means of other features
# numeric_only=True restricts the aggregation to age and experience; without it
# pandas >= 2.0 raises on the string column gender.
df.groupby("job").mean(numeric_only=True)   # mean is aggregation / reduction method
# there are other methods like sum, std,max or min

In [None]:
# we can only choose one of the feature
df.groupby("job").age.max() 

In [None]:
# Or we can choose multiple features
df.groupby("job")[["age","experience"]].min() 

In [None]:
df.info()
# as you can see gender is object
# However if we use groupby, we can convert it categorical data. 
# Because categorical data uses less memory, speed up operations like groupby
#df["gender"] = df["gender"].astype("category")
#df["job"] = df["job"].astype("category")
#df.info()