### Exploratory Data Analysis

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(color_codes= True)

In [None]:
# Load the cleaned booking export into a DataFrame.
# The first line of the file is skipped (presumably its own header row — confirm)
# and the explicit column names below are used instead; the strings
# 'NA', 'N/A' and 'NaN' are read as missing values.
resort_columns = [
    'invoice_id', 'first_name', 'last_name', 'state', 'zip', 'check_in',
    'bedrooms', 'weekly_rate', 'rent_discount', 'total_sale', 'parking',
    'mgmt_discount', 'pool_access', 'repeat_customer', 'year', 'currency',
    'address', 'city', 'county', 'customer_phone', 'personal_email',
    'customer_id',
]

resort_df = pd.read_csv(
    'Clean_CSV/oceanview_sunrise.csv',
    sep=',',
    engine='python',
    index_col=False,
    na_values=['NA', 'N/A', 'NaN'],
    header=None,
    skiprows=[0],
    names=resort_columns,
)

# Preview the first and last five rows.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
pd.concat([resort_df.head(), resort_df.tail()])

In [None]:
# Rank each booking's weekly_rate within its own state (rank 1 = highest rate)
weekly_rate_by_state = resort_df.groupby('state')['weekly_rate']
resort_df['rank_by_state'] = weekly_rate_by_state.rank(ascending=False)

# Show the first five rows when sorted by state in descending order
resort_df.sort_values(by='state', ascending=False).head()

In [None]:
# Derived column: weekly_rate + weekly_rate * rent_discount + total_sale.
# NOTE(review): the discount term is added rather than subtracted — confirm
# that this is the intended formula.
sum_total = (
    resort_df['weekly_rate']
    + resort_df['weekly_rate'] * resort_df['rent_discount']
    + resort_df['total_sale']
)

# Plain assignment appends 'sum_total' as the last column on the first run and
# overwrites it on later runs. DataFrame.insert raised ValueError ("already
# exists") whenever this cell was executed a second time.
resort_df['sum_total'] = sum_total

# Preview the first and last five rows (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0).
pd.concat([resort_df.head(), resort_df.tail()])

In [None]:
# Total_sale for 2021 and first 2 quarters of 2022.
# total_sale is summed per year before plotting. Using the continuous
# total_sale column as a countplot hue drew one bar (and one legend entry) per
# distinct sale amount rather than a total per year.
yearly_sales = resort_df.groupby('year', as_index=False)['total_sale'].sum()

sales_ax = sns.barplot(x='year', y='total_sale', data=yearly_sales)
sales_ax.set(title='Total Sales by Year', xlabel='Year', ylabel='Total sale');

In [None]:
# Group the bookings by state once and reuse the GroupBy object.
# (The earlier un-assigned sort_values / groupby expressions had no effect:
# their results were discarded and never displayed.)
state_groups = resort_df.groupby('state')

# Number of groups and their names
print(f"{state_groups.ngroups} states: {list(state_groups.groups.keys())}")

# Selecting a single group: Florida customers only
florida_df = state_groups.get_group("FL")

# Florida-only pivot table: mean of each numeric/boolean column per check-in date.
# The value columns are listed explicitly because pandas 2.x no longer silently
# drops non-numeric columns from a mean aggregation and raises TypeError instead.
florida_value_columns = (
    florida_df.select_dtypes(include=['number', 'bool']).columns.tolist()
)
florida_pivot = pd.pivot_table(
    florida_df,
    index=['check_in'],
    values=florida_value_columns,
    aggfunc='mean',
).reset_index()  # turn 'check_in' back into a regular column

florida_pivot

In [None]:
# Parse the check-in dates once, then split them into three columns
check_in_dates = pd.DatetimeIndex(florida_pivot['check_in'])
florida_pivot['year'] = check_in_dates.year
florida_pivot['month'] = check_in_dates.month
florida_pivot['day'] = check_in_dates.day

# Florida pivot table in chronological order: by year, then month, then day
sorted_fldf = florida_pivot.sort_values(by=['year', 'month', 'day'], ascending=True)
sorted_fldf

In [None]:
# Plot Florida sum_total by month.
# sns.lineplot returns the Axes, so it is kept in `ax` and the title is set in a
# separate call. Chaining .set_title() onto the plot call bound `ax` to the
# title's Text object instead of the Axes.
ax = sns.lineplot(x='month', y='sum_total', data=sorted_fldf)
ax.set_title("Florida Sums by Month");

In [35]:
# Container dictionary 'averages': state code -> [mean total_sale]
averages = {}

# Split: select the Florida bookings.
# The earlier `for date in resort_df['year'].unique()` loop never used its loop
# variable and rebuilt this same filter on every pass (and left temp_df
# undefined if it never ran), so a single selection gives the same result.
temp_df = resort_df[resort_df['state'] == 'FL']

# Apply: aggregate the selected rows
average = temp_df['total_sale'].mean()

# Combine: store the result under the state code
averages['FL'] = [average]

In [None]:
# Turn the 'averages' dictionary into a table: one row per key (here 'FL'),
# with a single column named 'total_sale_average'
aggregate_df = pd.DataFrame(
    list(averages.values()),
    index=list(averages.keys()),
    columns=['total_sale_average'],
)
aggregate_df

In [None]:
# Mean 'total_sale' for every state
total_sale_by_state = resort_df.groupby('state')['total_sale']
averages = total_sale_by_state.mean()
averages

In [None]:
# Standard deviation of 'total_sale' for every state
standard_deviations = (
    resort_df
    .groupby('state')['total_sale']
    .std()
)
standard_deviations

In [None]:
# Several aggregations of 'total_sale' per state in one .agg() call:
# mean, standard deviation and variance.
# String aliases are used instead of np.mean / np.std / np.var: pandas already
# mapped those callables to its own groupby methods, and passing them raises a
# FutureWarning on pandas >= 2.1. The repeated `import numpy as np` is dropped;
# numpy is already imported in the notebook's import cell.
aggs = resort_df.groupby('state')['total_sale'].agg(['mean', 'std', 'var'])
aggs

In [None]:
# Keep only the rows of states whose average 'total_sale' is below 4500
bookings_by_state = resort_df.groupby('state')
bookings_by_state.filter(lambda state_rows: state_rows['total_sale'].mean() < 4500)

In [None]:
# Spread of a group's values: its largest value minus its smallest
def group_range(x):
    """Return ``x.max() - x.min()`` for the given group of values."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest



# Range (max - min) of 'total_sale' within every (state, weekly_rate) group
sale_by_state_and_rate = resort_df.groupby(['state', 'weekly_rate'])['total_sale']
ranges = sale_by_state_and_rate.apply(group_range)
ranges

In [None]:
# First two records of every (state, total_sale) group
state_sale_groups = resort_df.groupby(['state', 'total_sale'])
state_sale_groups.head(2)

In [None]:
# First 10 rows of the 'rank_by_state' column.
# .head(10) selects by position. The earlier .loc[:10] sliced by label, which
# includes the end label and therefore returned 11 rows on the default
# 0-based integer index.
resort_df['rank_by_state'].head(10)

In [None]:
# Descriptive statistics for the numeric columns of the DataFrame
summary_stats = resort_df.describe()
summary_stats

In [None]:
# Cross-tabulate (mgmt_discount, pool_access) combinations against
# repeat_customer, with row and column totals (margins)
discount_and_pool = [resort_df['mgmt_discount'], resort_df['pool_access']]
pd.crosstab(index=discount_and_pool, columns=resort_df['repeat_customer'], margins=True)

In [None]:
# Pivot table with one row per state, summing the remaining columns
year_state_table = resort_df.pivot_table(index=['state'], aggfunc='sum')
year_state_table

In [None]:
# Order the per-state table by summed repeat_customer, highest first
sorted_df = year_state_table.sort_values(by='repeat_customer', ascending=False)

# Keep the top ten and move 'state' from the index back into a regular column
# so it can be used as a plot axis
top_ten_repeat_states = sorted_df.head(10).reset_index()

# Top 10 states for repeat_customers, as a horizontal bar chart
sns.barplot(data=top_ten_repeat_states, x='repeat_customer', y='state', orient='h')

In [None]:
# Column names, in their current order, before reordering
my_list = list(resort_df.columns)
my_list

In [None]:
# Final column layout, selected by name: 'rank_by_state' goes right after
# 'state' and 'sum_total' right after 'total_sale'.
# Selecting by name instead of by position (iloc) makes the cell safe to re-run
# and independent of the order in which the two derived columns were added;
# the positional version reshuffled the already-reordered frame on a second run.
final_columns = [
    'invoice_id', 'first_name', 'last_name', 'state', 'rank_by_state',
    'zip', 'check_in', 'bedrooms', 'weekly_rate', 'rent_discount',
    'total_sale', 'sum_total', 'parking', 'mgmt_discount', 'pool_access',
    'repeat_customer', 'year', 'currency', 'address', 'city', 'county',
    'customer_phone', 'personal_email', 'customer_id',
]
resort_df = resort_df[final_columns]

# Finalized DataFrame
resort_df

In [None]:
# Write the DataFrame to a comma-separated, UTF-8 encoded CSV file.
# index=False leaves the row index out of the file; otherwise it would be
# written as an extra leading column.
output_path = 'oceanview_sunrise_complete.csv'
resort_df.to_csv(output_path, sep=',', encoding='utf-8', index=False)

In [None]:
# Column names of the DataFrame in their current order
my_list = list(resort_df.columns)
my_list