**This kernel is an adapted work of Data Science Tutorial for Beginners. The original tutorial can be found [here](https://www.kaggle.com/kanncaa1/data-sciencetutorial-for-beginners).**


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the maths-course table of the student alcohol consumption data set
# and preview its first five rows.
csv_path = "/kaggle/input/student-alcohol-consumption/student-mat.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Overview of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Column labels of the data set.
data.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Pairwise correlation of the numeric columns.
# The frame also holds string columns (school, Mjob, Fjob, ...). Older pandas
# dropped them silently, but pandas 2.x raises when corr() meets them, so the
# numeric columns are selected explicitly.
data.select_dtypes(include="number").corr()

In [None]:
# Correlation map: annotated heat map of the pairwise correlations.
# corr() only makes sense for numeric data and raises on string columns in
# pandas 2.x, so the non-numeric columns are filtered out first.
numeric_data = data.select_dtypes(include="number")

f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(numeric_data.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

**LIST COMPREHENSION**

freetime - free time after school (numeric: from 1 - very low to 5 - very high)

In [None]:
# Label every student's free time: values above 2 count as "high", the rest as "low".
data["free_time_level"] = [
    "high" if level > 2 else "low"
    for level in data["freetime"]
]
# Compare the new label with the raw value for the rows labelled 0 through 10.
preview_columns = ["free_time_level", "freetime"]
data.loc[:10, preview_columns]


studytime - weekly study time (numeric: 1 - under 2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, 4 - over 10 hours)

In [None]:
# Three-way label with a nested conditional expression:
# above 7 -> "high", above 4 (up to 7) -> "normal", everything else -> "low".
data["study_time_level"] = [
    "high" if amount > 7 else ("normal" if amount > 4 else "low")
    for amount in data["studytime"]
]
preview_columns = ["study_time_level", "studytime"]
data.loc[:10, preview_columns]


# CLEANING DATA

In [None]:
# Check the non-null count of every column, including the two level columns added above.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

It seems there are no missing values.

value_counts(): Frequency counts

In [None]:
# Frequency of each mother's job; dropna=False also counts missing values (NaN), if any.
print(data["Mjob"].value_counts(dropna=False))

In [None]:
# Frequency of each father's job; dropna=True (the default) leaves NaN out of the counts.
print(data["Fjob"].value_counts(dropna=True))

In [None]:
# Summary statistics again, to check the actual value range of each numeric column.
data.describe()

We can see that studytime only ranges from 1 to 4, so the study-time-level column we added above is useless: with those thresholds every student ends up "low". Let's change it.

In [None]:
# Rebuild the label with a threshold that fits the real 1-4 range:
# values above 2 are "high", the rest "low".
data["study_time_level"] = [
    "high" if level > 2 else "low"
    for level in data["studytime"]
]
preview_columns = ["study_time_level", "studytime"]
data.loc[:10, preview_columns]

**Box plots: visualize basic statistics**

In [None]:
# One box per study_time_level group, showing the distribution of the goout column.
data.boxplot(column="goout", by="study_time_level" )

**MELTING DATA**

I want to find students whose dad works in a health profession.

In [None]:
# Keep only the rows whose father's job is "health".
father_in_health = data["Fjob"] == "health"
df_Fjob = data[father_in_health]
df_Fjob

In [None]:
# Reshape from wide to long: Fjob stays as the identifier, while Dalc and Walc
# are stacked into a "variable"/"value" pair of columns.
melted_dads = df_Fjob.melt(id_vars="Fjob", value_vars=["Dalc", "Walc"])
melted_dads

**CONCATENATING DATA**

In [None]:
# Stack the first five and the last five rows on top of each other (axis=0 adds rows).
df1, df2 = data.head(), data.tail()
concat_df_rows = pd.concat([df1, df2], axis=0)
concat_df_rows

In [None]:
# Put the weekend and workday alcohol columns side by side (axis=1 adds columns).
df3, df4 = data["Walc"], data["Dalc"]
df_concat_coloum = pd.concat([df3, df4], axis=1)
df_concat_coloum

When axis=0, we add rows; when axis=1, we add columns.

**DATA CONVERSION**

In [None]:
# Look at the current dtype of every column before converting any of them.
data.info()

In [None]:
# Convert the sex column from object/string to the category dtype, then
# confirm the change in the dtype listing.
sex_as_category = data["sex"].astype("category")
data["sex"] = sex_as_category
data.info()

We've changed from object/string type to category. Now let's change a column from integer to float.

In [None]:
# Convert the final grade G3 from integer to float and list all dtypes to verify.
g3_as_float = data["G3"].astype(float)
data["G3"] = g3_as_float
data.dtypes

**MISSING DATA & ASSERT**

We did not have any missing values in this data set, so we made some up.

In [None]:
# Create an artificial column with missing values: None for every male
# student ("M"), 1 for everyone else.
data["none"] = [
    None if gender == "M" else 1
    for gender in data["sex"]
]
data.head()

In [None]:
# Count the values of the made-up column; dropna=False makes the NaN entries visible.
data["none"].value_counts(dropna =False)

In [None]:
# data["none"].dropna(inplace=True) only changes a temporary Series taken out
# of the frame; the DataFrame itself keeps its NaN values (always so under
# copy-on-write), and the check in the next cell would then fail.
# Assign a cleaned column back to the frame instead. fillna is used rather
# than dropping rows so the rest of the notebook still sees every student.
data["none"] = data["none"].fillna(0)
data["none"].value_counts(dropna=False)  # NaN is gone

In [None]:
# Fails with AssertionError if the column still contains a missing value.
assert not data["none"].isnull().any()

It does not raise an error, which means that what we did worked. *assert* helps us check things.

# **HARRY POTTER AND PANDAS HALLOWS**

**BUILDING DATAFRAME** 

Let's say you are McGonagall and want to create a dataframe for the grades in the Transfiguration lesson.

First, we create the lists and a dictionary; then, using pd, we create the dataframe.

In [None]:
# One list per column, a list of column labels, and the matching list of columns.
student = ["Granger", "Potter", "Weasley"]
grade = ["AA", "BB", "CC"]
list_label = ["student", "grade"]
list_col = [student, grade]

# Pair each label with its column, turn the pairs into a dict, and build the frame from it.
zipped = [(label, column) for label, column in zip(list_label, list_col)]
hogw_dict = {label: column for label, column in zipped}
hogw_df = pd.DataFrame(data=hogw_dict)
hogw_df

In [None]:
# New column from a list with one value per row: how much each student hates Snape, out of 10.
hogw_df = hogw_df.assign(SnapeHate=[5, 10, 7])
hogw_df

In [None]:
# Broadcasting: a single scalar is repeated for every row of the new column.
hogw_df = hogw_df.assign(HagridLove=10)
hogw_df

**INDEXING PANDAS TIME SERIES**


In [None]:
# Five more students, described with the same four columns as hogw_df.
df2 = {"student": ["Longbottom", "Malfoy", "Lovegood", "Chang", "Thomas"],
       "grade": ["CC", "CB", "BA", "BB", "CC"],
       "SnapeHate": [10, 4, 8, 7, 7],
       "HagridLove": [8, 0, 8, 7, 7]}
df2 = pd.DataFrame(df2)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# is the supported replacement; ignore_index=True renumbers the rows 0..n-1
# exactly as append(..., ignore_index=True) did.
hogw_df = pd.concat([hogw_df, df2], ignore_index=True)
hogw_df

**DATA TIME SERIES**

In [None]:
# One ISO-formatted birthday string per student, in row order.
time_list = [
    "1979-09-19", "1980-07-31", "1980-03-01", "1980-07-30",
    "1980-07-05", "1979-02-13", "1979-05-30", "1979-10-20",
]
# Parse the strings into datetimes, store them as a column and use that column as the index.
datetime_object = pd.to_datetime(time_list)
hogw_df = hogw_df.assign(birthday=datetime_object).set_index("birthday")
hogw_df

Now we can select rows by the birthday index. The index values are no longer strings; we converted them to datetime.

In [None]:
# Look up the row(s) whose birthday index matches this date.
born_on_date = hogw_df.loc["1980-07-31"]
print(born_on_date)

In [None]:
# Label slice on the datetime index: birthdays between May 30, 1979 and July 5, 1980 (both ends included)
print(hogw_df.loc["1979-05-30":"1980-07-05"])

**RESAMPLING**

In [None]:
# Resample by year ("A" = year end) and average each numeric column.
# The string columns (student, grade) cannot be averaged and make mean() raise
# in pandas 2.x, so only the numeric columns are resampled.
hogw_df.select_dtypes(include="number").resample("A").mean()

In [None]:
# Resample by month ("M" = month end) and average each numeric column.
# Many rows are NaN because hogw_df has no birthday in most months.
# The string columns are left out: mean() raises on them in pandas 2.x.
hogw_df.select_dtypes(include="number").resample("M").mean()

In [None]:
# To fill the NaNs: take the first entry of every month, then interpolate
# linearly between the months that do have a value.
monthly_first = hogw_df.resample("M").first()
monthly_first.interpolate("linear")

Strings do not interpolate, of course. Let's interpolate with the mean instead.


In [None]:
# Monthly mean of the numeric columns, with the empty months filled by linear
# interpolation. String columns are excluded up front because mean() raises on
# them in pandas 2.x.
monthly_mean = hogw_df.select_dtypes(include="number").resample("M").mean()
monthly_mean.interpolate("linear")

**VISUAL EXPLORATORY DATA ANALYSIS**

In [None]:
# Line plot of weekend (Walc) and workday (Dalc) alcohol consumption, both on one axis.
df_alc = data[["Walc", "Dalc"]]
df_alc.plot()

In [None]:
# Same data, but each column is drawn in its own subplot.
df_alc.plot(subplots=True)
plt.show()

In [None]:
# Scatter plot of weekend alcohol consumption against age.
data.plot.scatter(x="age", y="Walc")
plt.show()

In [None]:
# Histogram of the age column: 50 bins spread over the range 15 to 22.
data.plot.hist(y="age", bins=50, range=(15, 22))
plt.show()

# **Manipulating Dataframes with Pandas**

In [None]:
data.head() # We can see that the index has no name: it is still the default one created by read_csv

In [None]:
# Number the rows 1, 2, ..., len(data) in a new column named "index".
data["index"] = np.arange(len(data)) + 1
data.head()  # the new "index" column starts from 1

In [None]:
# Promote the 1-based "index" column to be the row index of the frame.
data= data.set_index("index")
data.head()

In [None]:
data["Fjob"][3] # one way of selecting data: bracket column access, then the row labelled 3

In [None]:
data.Fjob[3] # another way of selecting data: attribute column access, then the row labelled 3

In [None]:
data[["Dalc","Walc"]] # choosing several columns with a list of labels

In [None]:
data.loc[5,["Walc"]] # using loc: the row labelled 5, restricted to the listed column

data["Walc"] is series 

data[["Walc"]] is data frame.  

In [None]:
# Single brackets give a Series, double brackets give a DataFrame.
for selection in (data["Walc"], data[["Walc"]]):
    print(type(selection))

In [None]:
data.loc[1:10,"G1":"none"] # rows labelled 1 to 10, columns from G1 through none (label slices include both ends)

In [None]:
data.loc[10:1:-1,"G1":"none"] # same selection with the rows in reverse order (10 down to 1)

In [None]:
data.loc[1:10,"G1":] # rows labelled 1 to 10, columns from G1 to the last column

**FILTERING DATA FRAMES**

In [None]:
# Boolean mask: True for every student whose first-period grade G1 is below 10.
boolean_variable = data["G1"].lt(10)
data.loc[boolean_variable]

OR

In [None]:
f1 = data["G2"].lt(10)  # first filter: second exam below 10
f2 = data["G1"].gt(10)  # second filter: first exam above 10
# Intersection of both filters: students scoring high in the first exam but low in the second.
data.loc[f1 & f2]

In [None]:
# Filter a single column: going-out level of the students whose workday
# alcohol consumption (Dalc) is above 2.
heavy_workday_drinkers = data["Dalc"] > 2
data.loc[heavy_workday_drinkers, "goout"]

**TRANSFORMING DATA**

In [None]:
def daily(n):
    """Return the weekly amount *n* spread evenly over the seven days of a week."""
    days_in_week = 7
    return n / days_in_week
# Run daily() on every studytime value to get a per-day figure.
data["dailystudy"] = data["studytime"].apply(daily)
data.head()

In [None]:
# Same transformation, written with an anonymous (lambda) function instead of a named one.
data["studydaily"] = data["studytime"].apply(lambda weekly: weekly / 7)
data.head()

In [None]:
# New column from existing ones: the sum of the three grade columns.
data["totalscore"] = data["G1"] + data["G2"] + data["G3"]
data.head()

In [None]:
# Show the current name of the row index.
print(data.index.name)

In [None]:
# Change the name of the row index to "#".
data.index = data.index.rename("#")
data.head()

In [None]:
# Hierarchical index: mother's job as the outer level, father's job as the inner level.
index_columns = ["Mjob", "Fjob"]
data1 = data.set_index(index_columns)
data1.head(100)

In [None]:
# Average of every numeric column per sex.
# numeric_only=True is required on pandas 2.x, where mean() raises on the
# string columns instead of silently dropping them.
data.groupby("sex").mean(numeric_only=True)

In [None]:
# Maximum of every column per father's job (string columns compare alphabetically).
# The sex column was converted to an unordered category, which has no defined
# maximum; pandas 2.x raises on it instead of silently dropping it, so category
# columns are excluded first.
data.select_dtypes(exclude="category").groupby("Fjob").max()

In [None]:
# Average weekend alcohol consumption per school.
walc_by_school = data.groupby("school")["Walc"]
walc_by_school.mean()