
# Data Manipulation with Python
Set up some basic packages:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read a CSV file into a DataFrame. The path must be a quoted string (or a
# path-like object); unquoted, path/file.csv is parsed as a division.
new_df = pd.read_csv("path/file.csv")
# Write the DataFrame back to CSV. index=False keeps the row index out of the
# file, so reading it again does not produce an extra "Unnamed: 0" column.
new_df.to_csv("path/file.csv", index=False)

## Inspecting a DataFrame

In [None]:
# Print the head of the homelessness data (the first rows)
print(homelessness.head())

# Show information on each of the columns, such as the data type and the
# number of non-null values. .info() prints its report itself and returns
# None, so it is called directly rather than wrapped in print().
homelessness.info()

# Print the shape of homelessness: the number of rows and columns of the DataFrame.
print(homelessness.shape)

# Print a description of homelessness: a few summary statistics for each column.
print(homelessness.describe())

# Print the column index of homelessness
print(homelessness.columns)

# Print the row index of homelessness
print(homelessness.index)
# Multiple aggregations: the result has one row per statistic and one column
# per selected column. Statistics are given by name ("mean", "median");
# passing the NumPy callables to .agg is deprecated in recent pandas.
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(["mean", "median"])

# Cumulative statistics: returns a new DataFrame in which, for each selected
# column, every element is the maximum of all values up to and including
# that position.
sales[['col1', 'col2']].cummax()


### 1. Sorting and Subsetting

In [None]:
# Sort by one column (ascending=True is the default)
dp.sort_values('column', ascending=True)
# Sort by several columns, with one direction per column
dp.sort_values(['column1', 'column2'], ascending=[True, False])
# Subsetting columns, option 1: pass the list of column names directly
dp[['column1', 'column2']]
# Subsetting columns, option 2: keep the list of names in a variable
col_sub = ['column1', 'column2']
dp[col_sub]
# Subsetting rows on a condition (the column name must be a quoted string)
dp[dp['column'] > 2]
# Subsetting on multiple conditions: wrap each condition in parentheses and
# combine them with & (and) or | (or)
dp[(dp['column1'] > 2) & (dp['column2'] < 5)]
# Subsetting rows whose value is one of several allowed values
dp[dp['colour'].isin(['black', 'red'])]

### 2. Manipulating DataFrame

In [None]:
# Derive a new column from an existing one (here: 'col' scaled by 100)
df['new_col'] = 100 * df['col']
# Drop rows that repeat the same combination of values in the key columns
dedupe_keys = ['col1', 'col2']
df.drop_duplicates(subset=dedupe_keys)

### 3. Grouped Summary statistics

In [None]:
# Whole-frame aggregation: choose the statistics per column with a dictionary
books.agg({"rating": ["mean", "std"], "year": ["median"]})

# The 'Salary' column split by 'Department', shared by the two summaries below
salary_by_department = df.groupby('Department')['Salary']

# Mean salary per department
grouped_mean = salary_by_department.mean()

# Sum and mean salary per department
grouped_stats = salary_by_department.agg(['sum', 'mean'])

# Named aggregation: each entry maps a new column name to a
# (source column, statistic) pair.
named_aggregations = {
    "num_schools": ('school_name', 'size'),  # the number of schools in the borough
    "average_SAT": ('total_SAT', 'mean'),    # the mean of column "total_SAT"
    "std_SAT": ('total_SAT', 'std'),         # the standard deviation of column "total_SAT"
}
borough_stats = (
    schools
    .groupby('borough')
    .agg(**named_aggregations)
    .round(2)
)

### 4. Pivot Table

In [None]:
# Creating a pivot table
# .pivot_table() takes the mean of each group by default; aggfunc overrides that.
pivot = df.pivot_table(
    values='Revenue',
    index='Date',
    columns='Department',
    aggfunc=['sum', 'mean'],  # statistics given by name rather than as callables
    fill_value=0,             # fill missing cells with 0
    margins=True,             # keyword is `margins`: adds an "All" row and column
                              # that apply each aggfunc across all rows / columns
)



#### a. Working with Pivot

In [None]:
# axis=0 (the same as axis="index") averages down the rows, giving one mean
# per column; axis=1 / axis="columns" would give one mean per row instead.
mean_temp_by_year = temp_by_country_city_vs_year.mean(axis=0)

# Keep only the entries whose mean equals the highest mean
highest_mean = mean_temp_by_year.max()
mean_temp_by_year[mean_temp_by_year == highest_mean]

### Data filtering in Pandas
Since ``df['column']`` returns a ``Series``, comparing it with a value produces a boolean ``Series``.
One way of filtering the data is to index the DataFrame with that boolean ``Series``:

In [None]:
# Filter by one condition
# Comparing a column with == yields a boolean Series: [True, False, ...].
# (A single = is assignment: it would overwrite the column, and is a syntax
# error inside parentheses.) `value` stands for whatever you compare against.
con = df['column'] == value
# Indexing with the boolean Series returns only the rows where it is True
result = df[con]
# e.g.:
np.logical_and(brics["area"] > 8, brics["area"] < 10)
brics[np.logical_and(brics["area"] > 8, brics["area"] < 10)]

# Filter on multiple conditions: wrap each condition in parentheses and join
# them with &. The opening bracket must be on the same line as the DataFrame;
# on a new line the assignment ends early and the conditions are never applied.
movie_short = netflix_df[
    (netflix_df['type'] == 'Movie')
    & (netflix_df['genre'] == 'Action')
    & (netflix_df['release_year'] >= 1990)
    & (netflix_df['release_year'] < 2000)
    & (netflix_df['duration'] <= 90)
]

# Filter string columns:

# Keep the rows whose email contains '@acacia.com'.
# regex=False matches the text literally (as a regex, '.' would match any
# character); na=False treats missing emails as non-matching so the mask
# stays purely boolean.
print(users[users['email'].str.contains('@acacia.com', regex=False, na=False)])

## Slicing and Indexing DataFrames

### 1. Explicit indexes

In [None]:
# Set a column as the index
dogs_ind = dogs.set_index("name")
# Remove the index: it is moved back into a regular column.
# Called on the indexed frame and not assigned, so dogs_ind keeps its index.
dogs_ind.reset_index()
# Drop the index: it is discarded instead of being kept as a column
dogs_ind.reset_index(drop=True)
# Subsetting rows through the index: pass a list of labels.
# (.loc["ind1", "ind2"] would mean row "ind1", column "ind2".)
dogs_ind.loc[["ind1", "ind2"]]
# With pairs as the index (multi-level), select rows with a list of tuples
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [('Brazil', 'Rio De Janeiro'), ('Pakistan', 'Lahore')]
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])
# Sorting by index levels, with one direction per level
dogs_ind3.sort_index(level=["color", "breed"], ascending=[True, False])



### 2. Slicing and Subsetting

#### a. Using ``loc``

In [None]:
# Outer index level: slice between two labels
first_breed, last_breed = "Chow Chow", "Poodle"
dogs_srt.loc[first_breed:last_breed]
# Inner index levels: the slice endpoints are tuples, one entry per level
start_dog = ("Labrador", "Brown")
end_dog = ("Schnauzer", "Grey")
dogs_srt.loc[start_dog:end_dog]
# Columns only: keep every row (:) and slice the column labels
temperatures_srt.loc[:, 'date':'avg_temp_c']
# Rows and columns in one call
start_city = ('India', 'Hyderabad')
end_city = ('Iraq', 'Baghdad')
temperatures_srt.loc[start_city:end_city, 'date':'avg_temp_c']

#### b. Using ``iloc``   

In [None]:
# Single cell: 23rd row, 2nd column (positions 22 and 1, counting from 0)
single_cell = temperatures.iloc[22, 1]
print(single_cell)

# Row slice: the first 5 rows
first_five_rows = temperatures.iloc[:5]
print(first_five_rows)

# Column slice: columns 3 to 4 (positions 2 and 3; the stop position is excluded)
third_and_fourth_cols = temperatures.iloc[:, 2:4]
print(third_and_fourth_cols)

# Both directions at once: first 5 rows of columns 3 to 4
top_left_block = temperatures.iloc[:5, 2:4]
print(top_left_block)

## Data Visualization from DataFrame

### 1. Histogram

In [None]:
# Histogram of a single column, split into 20 bins
column1_values = dog_pack["column1"]
column1_values.hist(bins=20)
plt.show()


### 2. Bar plot

In [None]:
# Mean weight per breed
avg_weight_by_breed = dog_pack.groupby("breed")["weight_kg"].mean()
# Bar plot of the grouped means (keyword arguments are separated by commas)
avg_weight_by_breed.plot(kind="bar", title="Mean Weight by Dog Breed")
plt.show()

### 3. Line plot

In [None]:
# Line plot of weight over time
sully.plot(x="date", y="weight_kg", kind="line")
plt.show()
# Same plot with the x-axis tick labels rotated by 45 degrees
sully.plot(x="date", y="weight_kg", kind="line", rot=45)
plt.show()

### 4. Scatter plots

In [None]:
# Scatter plot of weight against height
scatter_axes = {"x": "height_cm", "y": "weight_kg"}
dog_pack.plot(kind="scatter", **scatter_axes)

### 5. Overlaid Histograms

In [None]:
# Draw the height histogram of each sex in turn, female first, then male
for sex in ("F", "M"):
    heights = dog_pack[dog_pack["sex"] == sex]["height_cm"]
    heights.hist(alpha=0.7, bins=20)  # alpha is the transparency
plt.legend(["F", "M"])

## Dealing with Missing Values

### 1. Detect Missing Value

In [None]:
# Cell-level view: True wherever a value is missing, for every row and column
missing_mask = df.isna()
# Column-level summary: True/False per column, True if anything is missing
missing_mask.any()
# Number of missing values per column
missing_counts = missing_mask.sum()
# Bar chart of the missing-value counts
missing_counts.plot(kind = 'bar')



### 2. Dealing with Missing Values
- Drop missing values
    when they make up 5% or less of the total values
- Impute the mean, median, or mode
  depending on the distribution and context
- Impute by sub-group
  e.g. different experience levels have different median salaries

In [None]:
# Count the missing values in each column
na_per_column = df.isna().sum()
na_per_column

# When only a few values are missing and they do not matter: drop the rows that contain them
df.dropna(how="any")
# Alternatively, replace every missing value with 0
df.fillna(value=0)

# Find the columns whose missing values make up 5% or less of all rows
threshold = len(salaries) * 0.05
# Take the column labels from the same frame the mask is computed on
cols_to_drop = salaries.columns[salaries.isna().sum() <= threshold]
# Drop the rows that have a missing value in any of these columns
# (reassigned rather than inplace=True, so the step is explicit)
salaries = salaries.dropna(subset=cols_to_drop)

# Impute a summary statistic (here: the mode, i.e. the most frequent value)
cols_with_missing_values = df.columns[df.isna().sum() > 0]
# NOTE(review): [:-1] leaves the last column with missing values untouched,
# presumably so it can be imputed by sub-group instead; confirm, or drop the slice.
for col in cols_with_missing_values[:-1]:
    # fillna returns a new Series, so it must be assigned back to take effect
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute by sub-group: median salary for each experience level, as a dictionary
salaries_dict = salaries.groupby("Experience")["Salary_USD"].median().to_dict()
# Example output: {'Entry': 55380.0, 'Executive': 135439.0, 'Mid': 74173.5, 'Senior': 128903.0}
# Map each row's experience level to its group median and use it to fill the
# missing salaries. The same column name is used for the read and the write.
salaries["Salary_USD"] = salaries["Salary_USD"].fillna(salaries["Experience"].map(salaries_dict))


## Create DataFrame

### Create row by row from a list of dictionaries
# Each dictionary is one row: the keys are the column names and the values
# are that row's cell contents.
ginger_row = {
    "name": "Ginger",
    "breed": "Dachshund",
    "height_cm": 22,
    "weight_kg": 10,
    "date_of_birth": "2019-03-14",
}
list_of_dicts = [ginger_row]
new_dogs = pd.DataFrame(data=list_of_dicts)

### Create column by column from a dictionary of lists

In [3]:
# One list per column; position i in every list belongs to row i.
names = ["Ginger", "Scout"]
breeds = ["Dachshund", "Dalmatian"]
heights_cm = [22, 59]
weights_kg = [10, 25]
birth_dates = ["2019-03-14", "2019-05-09"]

# The dictionary keys become the column names
dict_of_lists = {
    "name": names,
    "breed": breeds,
    "height_cm": heights_cm,
    "weight_kg": weights_kg,
    "date_of_birth": birth_dates,
}
new_dogs2 = pd.DataFrame(data=dict_of_lists)