## Intro to Pandas - part 2

#### NUMPY and PANDAS 

In [None]:
#import libraries 
import numpy as np 
import pandas as pd 

In [None]:
#check whats in my current working directory 
#!ls

In [None]:
pwd

In [None]:
cd '/Users/siandavies/Documents/working'

In [None]:
#move to a sub directory to check i have the files 
#cd Healthcare

In [None]:
#check the files are there and their names
#!ls

In [None]:
#move up one directory (to working folder) 
#cd ..

In [None]:
# Read the first source file (a CSV) from the Healthcare sub-directory;
# the path is relative to the current working directory
file1=pd.read_csv('Healthcare/file1.csv')

In [None]:
# Check the size of file1: .shape is (number of rows, number of columns)
file1.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
file1.describe()

In [None]:
# Column names, dtypes and non-null counts - a quick way to spot nulls
file1.info()

In [None]:
# Preview the first five rows (head() defaults to n=5)
file1.head()

In [None]:
# Read the second file: a tab-separated text file, so the separator is set to '\t'
file2=pd.read_csv('Healthcare/file2.txt',sep='\t')

In [None]:
# Preview the first rows of file2
file2.head()

In [None]:
# Compare file2's shape with file1's: the column counts should match before
# the files are stacked. A matching count does not guarantee matching headers,
# so the headers could be compared as well.
file2.shape

In [None]:
# Shape of file1, for comparison with file2
file1.shape

In [None]:
# Read the third file: an Excel workbook, parsed with the openpyxl engine
file3=pd.read_excel('Healthcare/file3.xlsx',engine='openpyxl')

In [None]:
# Read the fourth file the same way (Excel workbook, openpyxl engine)
file4=pd.read_excel('Healthcare/file4.xlsx',engine='openpyxl')

In [None]:
#reading excel files may need an extra package (openpyxl). 
#this cell is for checking whether it is installed:
#if it is not installed, close down jupyter, go to the terminal 
#and install the package with: conda install openpyxl 
#!conda install openpyxl

In [None]:
# Keep file1's column labels to use as the header of the combined data frame -
# this is not always necessary, but it is good practice
columns=file1.columns

In [None]:
# Display the saved column labels
columns

In [None]:
# Create an empty data frame that has only the column headers (no rows yet)
data=pd.DataFrame(columns=columns)

In [None]:
# Display the data frame
data

In [None]:
# Stack file1's rows under the data frame (axis=0 concatenates row-wise)
# NOTE(review): recent pandas versions may emit a FutureWarning when one of the
# concatenated frames is empty - confirm against the installed version
data=pd.concat([data,file1],axis=0)

In [None]:
# Append the rows of the remaining three files.
# ignore_index is not set, so each file's rows keep their original row labels
data=pd.concat([data,file2,file3,file4],axis=0)

In [None]:
# Check the combined data frame - it should have 4028 rows in total
data.info()

In [None]:
# Preview the first rows of the combined data
data.head()

## some cleaning steps 
- change headers to lower case 
- revision of for loops 

In [None]:
# Build a list of lower-cased column headers.
# Iterating over data.columns directly yields each header in turn, so there is
# no need to index with range(len(...)).
cols=[]
for header in data.columns:
    cols.append(header.lower())

# the built-in vectorised equivalent is: data.columns.str.lower()

In [None]:
# Replace the column labels with the list held in cols
data.columns=cols

In [None]:
# Preview the data to check the headers
data.head()

### drop columns 

In [None]:
# Copy of the data without the 'controln' and 'hv1' columns; data itself is unchanged
datanew = data.drop(columns=['controln', 'hv1'])

In [None]:
# Preview the first rows of datanew
datanew.head()

#### Look at the end of the data with tail - notice the index has not been refreshed: the row labels are still the ones carried over from the individual files

In [None]:
# Last five rows (tail() defaults to n=5) - check the row labels on the left
datanew.tail()

In [None]:
# Column dtypes, non-null counts and the index range of datanew
datanew.info()

### reset index ! 

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old row labels
# instead of keeping them as an extra column
datanew = datanew.reset_index(drop=True)

## filtering the data frame using subsets or query()

In [None]:
# One boolean mask per condition; & keeps only the rows where both are True
in_florida = datanew['state'] == 'FL'
is_male = datanew['gender'] == 'M'
filtered = datanew[in_florida & is_male]

## california + females + avg gift greater than $10 

In [None]:
# California + female + average gift above 10
# (252: presumably the expected number of matching rows - confirm)
in_california = datanew['state'] == 'CA'
is_female = datanew['gender'] == 'F'
gift_over_10 = datanew['avggift'] > 10
filtered2 = datanew[in_california & is_female & gift_over_10]

In [None]:
# (rows, columns) of the filtered result
filtered2.shape

### same using .query() 

In [None]:
# The same three conditions written as a query string. Inside query() the
# comparison operators bind tighter than &, so no parentheses are needed
# around each condition.
filtered3=datanew.query('gender=="F"&state=="CA"&avggift>10')

In [None]:
# Row count and column summary of the query result
filtered3.info()

In [None]:
# Show only the three columns used in the filter, to eyeball the result
filtered3[['state','gender','avggift']]

## data cleaning - dtypes 

In [None]:
# Going back to the original data frame (data, not datanew)
data.shape

In [None]:
# Inspect the dtype of every column
data.dtypes

In [None]:
# first we try changing the column hv1 to a float, 
# but it contains some strings - so the dtype change does not work

In [None]:
# Attempted conversion, for demonstration only: the result is not assigned
# back, and errors='ignore' returns the original values when the conversion
# fails, so data['hv1'] is left as it was
data['hv1'].astype('float',errors='ignore')

In [None]:
# Check the dtypes again after the attempted conversion
data.dtypes

In [None]:
# Use pd.to_numeric instead: errors='coerce' turns values that cannot be
# parsed as numbers into NaN rather than raising

data['hv1']=pd.to_numeric(data['hv1'],errors='coerce')

In [None]:
# Same numeric conversion for ic1 (unparseable values become NaN)
data['ic1']=pd.to_numeric(data['ic1'],errors='coerce')

In [None]:
# Same numeric conversion for ic3 (unparseable values become NaN)
data['ic3']=pd.to_numeric(data['ic3'],errors='coerce')

In [None]:
# Same numeric conversion for ic5 (unparseable values become NaN)
data['ic5']=pd.to_numeric(data['ic5'],errors='coerce')

In [None]:
# Check the dtypes after the to_numeric conversions
data.dtypes

### Some useful functions to look into a column 

In [None]:
# Distinct values found in the hvp1 column
data['hvp1'].unique()

In [None]:
# Number of rows holding each distinct hvp1 value
data['hvp1'].value_counts()

### drop duplicate rows - using the drop_duplicates() function

- without the `subset` argument, only rows that are duplicated across every column are dropped

In [None]:
# Remove rows that repeat an earlier row in every column;
# with the default settings the first occurrence is kept
data=data.drop_duplicates()

In [None]:
# (rows, columns) after removing duplicates
data.shape

## extract resulting data frame as csv 

In [None]:
#data.to_csv('Healthcare/day1.csv',index=False)

# NULLS 

In [None]:
# The non-null counts per column show where values are missing
data.info()

In [None]:
# Use this to look for rows that are entirely null: the result is a boolean
# Series that is True wherever every column of that row is missing

data.isna().all(axis=1)

In [None]:
# Nulls snapshot table: share of missing values per column,
# rounded to 4 decimals and then expressed as a percentage
null_share = data.isna().sum() / len(data)
nullsdf = pd.DataFrame(null_share.round(4) * 100)

In [None]:
# The column names of data are held in nullsdf's index, so there is no
# 'index' column to rename until reset_index() moves them into one.
# After that, 'index' becomes 'header' and the value column 0 becomes
# 'proportionnulls' (its values are percentages, 0-100).
nullsdf=nullsdf.reset_index().rename(columns={'index':'header',0:'proportionnulls'})

In [None]:
# Display the nulls snapshot table
nullsdf

In [None]:
# Rows where gender is missing; isna() already returns a boolean mask,
# so it can be used to filter directly
data[data['gender'].isna()]

In [None]:
# Fill the null genders with a new explicit label instead of leaving them empty
data['gender']=data['gender'].fillna("Unknown")

### numeric columns - fill in - fill with mean 

In [None]:
# Mean of hv1; Series.mean() skips NaN values by default
hv1_mean=data["hv1"].mean()

In [None]:
# Replace nulls in hv1 with mean ! 

In [None]:
# Replace the nulls in hv1 with the column mean stored in hv1_mean
data['hv1']=data['hv1'].fillna(hv1_mean)

In [None]:
# Check the non-null counts again after filling
data.info()

## introducing lambda, map + list 

A lambda is an anonymous function - it can be used in place of ...

+ def functionname(arguments): 
    + return something ... 

In [None]:
# Distinct gender labels currently in the data
data['gender'].unique()

In [None]:
# A lambda bound to a name: takes x and y and returns their product.
# Equivalent to: def multiplication(x, y): return x*y
multiplication= lambda x,y: x*y

In [None]:
# Call the lambda like any other function: 20*30 gives 600
multiplication(20,30)

In [None]:
# Use a lambda to make every gender label upper case: map() applies the lambda
# to each value and list() collects the results before they are assigned back.
# x.upper() is a string method, so every value must be a string at this point.
data['gender']=list(map(lambda x: x.upper(),data['gender']))
                   

In [None]:
#use a logical statement to clean the gender values 

def clean_gender(x):
    """Collapse a raw gender label into 'M', 'F' or "UNKNOWN".

    Parameters:
        x: the raw label. Text is compared ignoring case and surrounding
           whitespace. Anything that is not a string (None, NaN, numbers)
           is treated as having no usable label.

    Returns:
        'M' for 'M' or 'MALE', 'F' for any label starting with 'F',
        and "UNKNOWN" for everything else.
    """
    # Missing values reach this function as None/NaN, which have no string
    # methods - return the fallback label instead of raising AttributeError.
    if not isinstance(x, str):
        return "UNKNOWN"

    label = x.strip().upper()
    if label in ('M', 'MALE'):
        return 'M'
    if label.startswith('F'):
        return 'F'
    return "UNKNOWN"

In [None]:
# Apply clean_gender to every value and store the cleaned labels back in the column
data['gender']=list(map(clean_gender,data['gender']))

In [None]:
# Check it worked: count the rows per gender label after cleaning
data['gender'].value_counts()

In [None]:
# Save the data frame as a CSV; index=False leaves the row labels out of the file
data.to_csv('Healthcare/day2.csv',index=False)