In [None]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats

In [None]:
# Load the 311 service-request export into a DataFrame
csv_path = '/kaggle/input/nyc311-2010/311_Service_Requests_from_2010_to_Present.csv'
nyc_311 = pd.read_csv(csv_path)

In [None]:
# Check the shape of the data: (number of rows, number of columns)
nyc_311.shape

In [None]:
# Preview the first rows of the raw data
nyc_311.head()

### Prepare the data

In [None]:
# Print column names, non-null counts and dtypes to spot columns needing conversion
nyc_311.info()

#### The columns 'Created Date' and 'Closed Date' have the datatype 'object', so we convert them to the datetime datatype.

#### Convert the columns ‘Created Date’ and ‘Closed Date’ to the datetime datatype and create a new column ‘Request_Closing_Time’ holding the time elapsed (in seconds) between request creation and request closing

In [None]:
# Parse both timestamp columns from object (string) dtype into datetime64
for date_col in ('Created Date', 'Closed Date'):
    nyc_311[date_col] = pd.to_datetime(nyc_311[date_col])

In [None]:
# Elapsed time between creation and closing of each request, stored in seconds
elapsed = nyc_311['Closed Date'].sub(nyc_311['Created Date'])
nyc_311['Request_Closing_Time'] = elapsed.dt.total_seconds()

In [None]:
# Preview the frame again to confirm the new 'Request_Closing_Time' column
nyc_311.head()

### Major insights/patterns

In [None]:
# List the distinct values of the 'Complaint Type' column
nyc_311['Complaint Type'].unique()

In [None]:
# Bar chart of the number of requests per complaint type, most frequent first
fig, ax = plt.subplots(figsize=(15, 10))
complaints_count = nyc_311['Complaint Type'].value_counts()
complaints_count.plot.bar(ax=ax)
ax.set_title('Most common complaints', fontsize=25)
plt.show()

In [None]:
# Get the top 10 complaint types:
# value_counts() tallies each type and nlargest(10) keeps the ten highest counts
nyc_311['Complaint Type'].value_counts().nlargest(10)

In [None]:
# List the distinct values of the 'City' column (missing values show up as nan)
nyc_311['City'].unique() 

In [None]:
# Count the requests that have no 'City' value
nyc_311['City'].isna().sum()

In [None]:
# Replace missing city names with the placeholder 'Unknown City'.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: the in-place form acts on an intermediate object,
# emits a chained-assignment warning on recent pandas 2.x releases, and leaves
# nyc_311 unchanged when Copy-on-Write is enabled.
nyc_311['City'] = nyc_311['City'].fillna('Unknown City')

In [None]:
# Re-count missing 'City' values after the fill
nyc_311['City'].isna().sum()

In [None]:
# Get the complaint type per city:
# number of requests for every (City, Complaint Type) pair present in the data
nyc_311.groupby(['City','Complaint Type']).size()

In [None]:
# Cross-tabulate request counts: one row per city, one column per complaint type
complaints_per_city = pd.crosstab(nyc_311['City'], nyc_311['Complaint Type'])

In [None]:
# Display the city-by-complaint-type count table
complaints_per_city

In [None]:
# Stacked bars: one bar per city, segmented by complaint type
ax = complaints_per_city.plot.bar(stacked=True, figsize=(15, 20))
ax.set_title('Complaints per City', fontsize=20)
plt.show()

#### It is observed that 'Brooklyn' has the highest number of complaints

In [None]:
# Keep only the requests whose 'City' is exactly 'BROOKLYN'
nyc_311_brooklyn = nyc_311.loc[nyc_311['City'].eq('BROOKLYN')]

In [None]:
# (rows, columns) of the Brooklyn subset
nyc_311_brooklyn.shape

In [None]:
# Bar chart of request counts per complaint type within the Brooklyn subset
fig, ax = plt.subplots(figsize=(15, 10))
brooklyn_complaint_count = nyc_311_brooklyn['Complaint Type'].value_counts()
brooklyn_complaint_count.plot.bar(ax=ax)
ax.set_title('Most common complaints in Brooklyn', fontsize=25)
plt.show()

#### Order the complaint types based on the average ‘Request_Closing_Time’, grouping them for different locations

In [None]:
# Count the requests with no computed closing time
nyc_311['Request_Closing_Time'].isna().sum()

In [None]:
# Fill missing closing times with 0 seconds, assigning the result back rather
# than using fillna(..., inplace=True) on a column selection (that form warns
# on recent pandas 2.x and does not update nyc_311 under Copy-on-Write).
# NOTE(review): rows with a missing Request_Closing_Time are treated as closed
# instantly, which lowers the group means computed afterwards — confirm this
# is intended rather than excluding those rows.
nyc_311['Request_Closing_Time'] = nyc_311['Request_Closing_Time'].fillna(0)

In [None]:
# Re-count missing closing times after the fill
nyc_311['Request_Closing_Time'].isna().sum()

In [None]:
# Mean closing time (seconds) for every (City, Complaint Type) pair
nyc_311_avg_response_time = (
    nyc_311
    .groupby(['City', 'Complaint Type'])['Request_Closing_Time']
    .mean()
)

In [None]:
# Preview the first 30 (City, Complaint Type) mean closing times
nyc_311_avg_response_time.head(30)

In [None]:
# Mean closing time (seconds) per complaint type, ordered from fastest to slowest
avg_res_time = (
    nyc_311
    .groupby('Complaint Type')['Request_Closing_Time']
    .mean()
    .sort_values()
)

In [None]:
# The 20 complaint types with the lowest mean closing time (series is sorted ascending)
avg_res_time.head(20)

### Perform a statistical test

#### Is the average response time across complaint types similar (overall)?

From the above data, it is observed that the average resolution times across the complaint types are not equal.

However, the following complaint types have average response times (in seconds) that are very close to each other:

Traffic                      12304.383994
Disorderly Youth             12363.749206

- Null Hypothesis (H0): the average response times across the selected complaint types are equal
- Alternate Hypothesis (H1): the average response times across the selected complaint types are not equal

### Let's perform one way ANOVA for the above group of complaints

In [None]:
# Closing times of 'Traffic' requests, kept as a single-column DataFrame
nyc_311_traffic = nyc_311.loc[
    nyc_311['Complaint Type'] == 'Traffic',
    ['Request_Closing_Time'],
]

In [None]:
# Preview the 'Traffic' closing times
nyc_311_traffic.head()

In [None]:
# Closing times of 'Disorderly Youth' requests, kept as a single-column DataFrame
nyc_311_dis_youth = nyc_311.loc[
    nyc_311['Complaint Type'] == 'Disorderly Youth',
    ['Request_Closing_Time'],
]

In [None]:
# Preview the 'Disorderly Youth' closing times
nyc_311_dis_youth.head()

In [None]:
# One-way ANOVA comparing the closing times of 'Traffic' and 'Disorderly Youth'.
# The 1-D 'Request_Closing_Time' columns are passed instead of the
# single-column DataFrames: with 2-D inputs f_oneway runs along axis 0 and
# returns length-1 arrays, whereas 1-D samples yield scalar fvalue and pvalue.
fvalue, pvalue = stats.f_oneway(
    nyc_311_traffic['Request_Closing_Time'],
    nyc_311_dis_youth['Request_Closing_Time'],
)

In [None]:
# Display the p-value returned by the ANOVA
pvalue

Fail to reject the null hypothesis (H0) for complaint types 'Traffic' and 'Disorderly Youth', as the p-value > 0.05

### Using a crosstab and the chi-square test to check whether complaint type and location are related

In [None]:
# Contingency table of request counts: rows are cities, columns are complaint types
nyc_311_city = pd.crosstab(
    index=nyc_311['City'],
    columns=nyc_311['Complaint Type'],
)

In [None]:
#import required libraries
from scipy.stats import chi2_contingency

In [None]:
# Contingency table of complaint counts (cities x complaint types)
table = nyc_311_city

# Significance level used for the decision
alpha = 0.05

# chi2_contingency returns the chi-square statistic, p-value,
# degrees of freedom and the table of expected frequencies
stat, p, dof, expected = chi2_contingency(table)

# Report the result: H0 is rejected when p does not exceed alpha
print('significance=%.3f, p=%.3f' % (alpha, p))
verdict = (
    'Variables are associated (reject H0)'
    if p <= alpha
    else 'Variables are not associated(fail to reject H0)'
)
print(verdict)