In [None]:
# DO NOT CHANGE

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
# from scipy.stats import pearsonr
import scipy.stats as stats 
from sklearn import preprocessing

plt.figure(figsize=(20, 20))

In [None]:
# DO NOT CHANGE

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# DO NOT CHANGE

df = pd.read_csv('/kaggle/input/fda-places/FDA_Places.csv')

In [None]:
df.shape     # (number of rows, number of columns) of the dataset just loaded

In [None]:
df.head()    # Show the first five rows (five is head()'s default)

In [None]:
df.tail()    # Show the last five rows (five is tail()'s default)

In [None]:
df.info()    # Column dtypes as pandas inferred them, with a non-null count per column
             # If a column's non-null count is lower than the row count reported by df.shape,
             # that column has missing data

In [None]:
df.describe()    # Summary statistics table for the dataframe's columns

# A little data cleaning

1. Check for missing values
2. Check for duplicate data
3. Change column type
4. Remove data that will not add value (e.g., identifier columns)

### Missing Data

In [None]:
# Per-column count of rows whose value is missing.

df.isna().sum()

In [None]:
# Share of missing values per column, expressed as a percentage of all rows.

missing_fraction = df.isna().mean()
df_null = missing_fraction.round(4) * 100

# Show the five columns with the highest missing percentage.
df_null.sort_values(ascending=False).head()

In [None]:
# Heatmap of the missing-value mask: one cell per (row, column), no colour bar.

missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False)

### Describe ChildPoverty to decide how to impute it

In [None]:
# Summary statistics for ChildPoverty, used to choose an imputation value
df['ChildPoverty'].describe()

In [None]:
# Median of ChildPoverty (one candidate fill value)
df['ChildPoverty'].median()

In [None]:
# Most frequent ChildPoverty value(s); mode() returns a Series because ties are possible
df['ChildPoverty'].mode()

In [None]:
# Impute missing ChildPoverty values with the column mean.

child_poverty_mean = df['ChildPoverty'].mean()
df['ChildPoverty'] = df['ChildPoverty'].fillna(child_poverty_mean)

# Recompute the missing-value percentages to confirm the effect of the fill.
df_null = df.isna().mean().round(4) * 100
df_null.sort_values(ascending=False).head()


### Looking for duplicates

In [None]:
# Rows that are exact repeats of an earlier row.

duplicate_mask = df.duplicated()
df[duplicate_mask]

In [None]:
# Drop duplicate rows, keeping the first occurrence of each.
# keep='first' is already the pandas default and is spelled out only for clarity;
# keep='last' would retain the last occurrence and keep=False would drop every copy.
# Reassigning (rather than inplace=True) keeps the step explicit and chainable.

df = df.drop_duplicates(keep='first')

In [None]:
# Re-run the duplicate check on the current dataframe.
df[df.duplicated()]

In [None]:
df.shape     # (rows, columns) at this point, for comparison with the shape printed after loading

### Changing the data type

In [None]:
# CountyFIPS is an identifier, not a quantity, so store it as a string.
# NOTE(review): if the CSV held FIPS codes as integers, any leading zeros were
# already lost on load — confirm whether zero-padding is needed.

df = df.astype({'CountyFIPS': str})

### Remove a single column

The FamilyWork column will be removed because it contains too many null values.

In [None]:
# Remove the FamilyWork column from df.
df = df.drop(columns='FamilyWork')

### Remove multiple columns

In [None]:
# Columns to remove in one call; add more names to the list as needed.
drop_columns = ['Unemployment']

# df is reassigned to the result, so from here on these columns are gone from df.
# Later cells must not select them.
df = df.drop(columns=drop_columns)

In [None]:
df.shape     # (rows, columns) after the column drops above

# Exploratory Data Analysis

## Examine BINGE column

In [None]:
# Mean BINGE value by State, highest first

dfx = pd.DataFrame(df.groupby(['StateDesc'])['BINGE'].mean().sort_values(ascending = False))

# DataFrame.plot.bar() opens its own figure, so the size is passed to it directly.
# A preceding plt.figure(figsize=...) would only leave an empty figure behind.
dfx.plot.bar(figsize=(10, 10))
plt.title('Mean BINGE Value for States')
plt.show()

In [None]:
# Summary statistics for the BINGE column

df['BINGE'].describe()

### Histograms

In [None]:
# Histogram of BINGE (no KDE overlay), written to xyz.png
plt.figure(figsize=(30, 10))
binge_hist_ax = sns.histplot(df['BINGE'], kde=False)
binge_hist_ax.set_title('Histogram of BINGE')
plt.savefig('xyz.png')

In [None]:
# Histogram of BINGE with an explicit bin count

N_BINS = 50  # controls how finely the BINGE range is split

plt.figure(figsize=(20, 10))
plt.title('BINGE Distribution Plot')
sns.histplot(df['BINGE'], kde=False, bins=N_BINS)

### Boxplots

In [None]:
# Single vertical boxplot of BINGE.
# (Adding orient='h' to the call would draw it horizontally instead.)

sns.boxplot(data=df, y='BINGE', palette='rainbow')

In [None]:
# Boxplot of BINGE with explicit styling, title and axis labels
binge_box_ax = sns.boxplot(
    data=df['BINGE'],
    orient="v",
    palette='Set3',
    whis=1.5,
    saturation=1,
    width=0.7,
)
binge_box_ax.set_title("BINGE Boxplot", fontsize=14, fontweight='bold')
binge_box_ax.set_ylabel("BINGE Range", fontweight='bold')
binge_box_ax.set_xlabel("Continuous Variable", fontweight='bold')

In [None]:
# One BINGE box per StateDesc value

sns.boxplot(data=df, x='StateDesc', y='BINGE', palette='rainbow')

In [None]:
# Multiple boxplots: one panel per measure, each split by State
plt.figure(figsize=(20, 15))

boxplot_measures = ['BINGE', 'CSMOKING', 'LPA', 'OBESITY', 'SLEEP']
for panel_number, measure in enumerate(boxplot_measures, start=1):
    # Panels fill a 3x3 grid in row-major order; slots 6-9 stay unused.
    plt.subplot(3, 3, panel_number)
    sns.boxplot(x='State', y=measure, data=df)

plt.show()

### Violinplots

In [None]:
# Violinplot of DIABETES over all rows (no grouping variable)

sns.violinplot(data=df, y='DIABETES', palette='rainbow')

### Counting unique values for each variable

In [None]:
# Print the number of distinct (non-missing) values in every column.
for column, unique_count in df.nunique().items():
    print(f"{column}: Number of unique values {unique_count}")
    print("==========================================================")

In [None]:
# For every object-dtype column with at most 30 distinct values, print the
# distinct values and how many rows carry each one. The qualifying column
# names are collected in object_col.
object_col = []
for column in df.columns:
    values = df[column]
    if values.dtype != object or len(values.unique()) > 30:
        continue
    object_col.append(column)
    print(f"{column} : {values.unique()}")
    print(values.value_counts())
    print("====================================")

# 

## Bar Charts

In [None]:
# Bar chart: number of rows (counties) per State

plt.rcParams['figure.figsize'] = [15, 8]   # note: changes the default size for later figures too

counties_per_state = df['State'].value_counts()
ax = counties_per_state.plot(kind='bar', stacked=True, colormap='Set1')
ax.title.set_text('State')
ax.set_xlabel("Names of the State", fontweight='bold')
ax.set_ylabel("Count of Counties", fontweight='bold')

### Count number of rows for each unique value of a variable

In [None]:
# Row count for each distinct State value, most frequent first
x= df['State'].value_counts()
x

### Multiple bar charts in a figure

In [None]:
# Three side-by-side bar charts, one per categorical column.
# Each entry is (column in df, panel title, x-axis label).
bar_panels = [
    ('State', 'States', 'State'),
    ('StateAbbr', 'State Abbreviation', 'State Abbreviation'),
    ('GenderMajority', 'Gender Majority', 'Gender'),
]

plt.figure(figsize=(25, 6))

for panel_number, (column, title, xlabel) in enumerate(bar_panels, start=1):
    plt.subplot(1, len(bar_panels), panel_number)
    plt1 = df[column].value_counts().plot(kind='bar')
    plt.title(title)
    # Every panel shows row counts, so the y label is the same throughout.
    # (The last panel previously read 'Frequency of Car Body', a leftover from another dataset.)
    plt1.set(xlabel=xlabel, ylabel='Frequency')

### Binning numeric values into groups

In [None]:
# Bucket each row by TotalPopulation. Bins are right-closed: (0, 25000] is 'Small', and so on.
# The last bin is open-ended (np.inf) so that populations above 1,000,000 are labelled
# 'Metro' instead of falling outside every bin and becoming NaN.
pop_bins = [0, 25000, 100000, 150000, np.inf]
pop_labels = ['Small', 'Medium', 'Large', 'Metro']
pop_category = pd.cut(df.TotalPopulation, bins=pop_bins, labels=pop_labels)

# df.insert raises ValueError if the column already exists, so drop any copy left by
# an earlier run of this cell; that makes the cell safe to re-run.
if 'Pop Group' in df.columns:
    df = df.drop(columns='Pop Group')
df.insert(4, 'Pop Group', pop_category)

In [None]:
df['Pop Group'].value_counts(normalize=False) # Raw row counts per group; normalize=True would return each group's fraction of the total instead

In [None]:
# Bar chart: number of rows in each population group
plt.figure(figsize=(15, 8))

rows_per_group = df['Pop Group'].value_counts()
ax = rows_per_group.plot(kind='bar', stacked=True, colormap='Set1')
ax.title.set_text('Categories')
ax.set_xlabel("Population Groups", fontweight='bold')
ax.set_ylabel("Count of Rows", fontweight='bold')

### Scatterplots

In [None]:
# Basic scatterplot: DIABETES (y) against KIDNEY (x)

kidney_values = df['KIDNEY']
diabetes_values = df['DIABETES']

plt.scatter(x=kidney_values, y=diabetes_values)
plt.xlabel("KIDNEY", fontweight='bold')
plt.ylabel("DIABETES", fontweight='bold')

In [None]:
# Multiple plots: each predictor on x against DIABETES on y.

predictors = ['BINGE', 'CSMOKING', 'LPA', 'OBESITY', 'SLEEP']

fig, axes = plt.subplots(2, 3, figsize=(18, 15))
panel_axes = axes.flatten()   # walk the 2x3 grid in row-major order

# The loop variable has its own name so that the predictor list is not
# overwritten while it is being iterated.
for panel_ax, predictor in zip(panel_axes, predictors):
    sns.scatterplot(x=predictor, y='DIABETES', data=df, ax=panel_ax)
    plt.setp(panel_ax.get_xticklabels(), rotation=45)

# The grid has more slots than predictors; hide the unused ones rather than
# leaving empty axes in the figure.
for unused_ax in panel_axes[len(predictors):]:
    unused_ax.set_visible(False)

plt.subplots_adjust(hspace=0.5)


In [None]:
# Scatterplot: DIABETES (y) against OBESITY (x).
# The x data now matches the OBESITY axis label; KIDNEY was being plotted under that label.
plt.scatter(x=df['OBESITY'], y=df['DIABETES'])
plt.xlabel("OBESITY", fontweight='bold')
plt.ylabel("DIABETES", fontweight='bold')

In [None]:
# Same OBESITY vs DIABETES scatter, drawn with seaborn (axis labels come from the column names)
sns.scatterplot(data=df, x='OBESITY', y='DIABETES')

In [None]:
# OBESITY vs DIABETES with a fitted regression line per GenderMajority value
sns.lmplot(data=df, x="OBESITY", y="DIABETES", hue="GenderMajority");


In [None]:
# Same regression plot, with a distinct marker for each GenderMajority value
hue_markers = ["o", "x"]
sns.lmplot(data=df, x="OBESITY", y="DIABETES", hue="GenderMajority", markers=hue_markers);

In [None]:
# Pairplot 
#sns.pairplot(df)  This will take a long time to run and return an unreadable image because we have so many columns.

In [None]:
# Create a smaller dataframe for the pairplot and correlation cells that follow.
# 'Unemployment' is intentionally not listed: that column was dropped from df in the
# data-cleaning section, so selecting it here raises KeyError.

df2 = df[[
    'TotalPopulation',
    'ChildPoverty',
    'MeanCommute',
    'MeanHealthCommute',
    'Unemployment%_2019',
    'Median Household Income_2019',
]]

In [None]:
# Pairwise scatterplots / histograms for the df2 columns.
# sns.pairplot builds and sizes its own figure, so no plt.figure() call precedes it.
# Such a call only produced an extra, empty figure in the output.
sns.pairplot(df2);

In [None]:
# Pairwise correlation matrix of the df2 columns
xyz = df2.corr()
xyz

In [None]:
# Heatmap of the correlation matrix, without the numeric value written in each cell
sns.heatmap(xyz, annot=False)

In [None]:
# Annotated correlation heatmap for the df2 columns
plt.figure(figsize=(15, 15))
corr = df2.corr()

# Diverging palette centred on 0, with the colour scale fixed to [-1, 1].
diverging_cmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=diverging_cmap,
                 square=True, annot=True, annot_kws={"size": 20})

# Slant the x tick labels so long column names do not overlap.
x_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(x_tick_labels, rotation=45, horizontalalignment='right');

In [None]:
# Correlation heatmap across the numeric columns of the full dataframe.
plt.figure(figsize=(20, 20))

# df also holds non-numeric columns (for example CountyFIPS was cast to str and
# 'Pop Group' is categorical). From pandas 2.0, DataFrame.corr() raises on such
# columns instead of silently skipping them, so the numeric columns are selected
# explicitly first. This works on older pandas versions as well.
corr = df.select_dtypes(include='number').corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=False, annot_kws={"size":20}
)
# Slant the x tick labels so the many column names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);