In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Task 1: Import the NYC 311 service-request dataset into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/nyc-311-customer-service-requests-analysis/NYC311data.csv"
)

In [None]:
# Preview the first five rows of the loaded data.
data.head(n=5)

In [None]:
# List the column labels of the loaded dataset.
data.columns

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Show every request whose complaint type is 'Animal in a Park'.
data.loc[data['Complaint Type'].eq('Animal in a Park')]

In [None]:
# Remove the row with index label 283132 (NOTE(review): presumably the single
# 'Animal in a Park' record listed by the previous cell - confirm).
# errors='ignore' keeps this cell safe to re-run once the row is already gone,
# and rebinding `data` (instead of inplace=True) avoids mutating the frame in place.
data = data.drop(index=283132, errors='ignore')

In [None]:
# Task 2: Read or convert the columns ‘Created Date’ and ‘Closed Date’ to datetime datatype and create a new column ‘Request_Closing_Time’
# as the time elapsed between request creation and request closing.

In [None]:
# Parse the creation timestamps into datetime values.
data['Created Date'] = pd.to_datetime(arg=data['Created Date'])

In [None]:
# Parse the closing timestamps into datetime values.
data['Closed Date'] = pd.to_datetime(arg=data['Closed Date'])

In [None]:
# Elapsed time between creation and closing of each request (row-wise difference).
data['Request_Closing_Time'] = data['Closed Date'] - data['Created Date']

In [None]:
# Express the closing time as a floating-point number of minutes.
data['Request_Closing_Time_mins'] = data['Request_Closing_Time'] / pd.Timedelta(minutes=1)

In [None]:
# Preview the first five rows, now including the closing-time columns.
data.head(n=5)

In [None]:
# Print column dtypes and non-null counts for the frame.
data.info()

In [None]:
# Provide major insights/patterns that you can offer in a visual format (graphs or tables); 
#at least 4 major conclusions that you can come up with after generic data mining.

In [None]:
# Distinct complaint types, in order of first appearance.
pd.unique(data['Complaint Type'])

In [None]:
# Conclusion 1: The city with the maximum number of complaints was found to be
# Brooklyn, with several different types of complaints.
# Rows are cities, columns are complaint types, cells are complaint counts.
df1 = (
    data
    .groupby(by=['City', 'Complaint Type'])
    .size()
    .unstack()
    .fillna(value=0)
)

In [None]:
# Stacked bar chart: one bar per city, split by complaint type.
df1.plot(kind='bar', stacked=True, figsize=(15, 10))
plt.ylabel(ylabel='Number of Complaints')
plt.title(label='Number of complaints vs. City')

In [None]:
# Distinct values of the request status.
data['Status'].unique()

In [None]:
# Rows are cities, columns are statuses, cells are request counts.
df2 = (
    data
    .groupby(by=['City', 'Status'])
    .size()
    .unstack()
    .fillna(value=0)
)

In [None]:
# Conclusion 2: Based upon the status of complaints it can be inferred that the maximum number of complaints were resolved.
# From the total unresolved cases, the top 5 cities have been found, which sum up to 76.92 % of total Open cases.
df2.sort_values('Open', ascending=False).iloc[:5]  # Thus the maximum pending cases are in New York

In [None]:
# Share (in percent) of all open requests that belongs to each city.
df2['Unresolved_percentage'] = df2['Open'].div(df2['Open'].sum()).mul(100)

In [None]:
# Column totals over the five cities with the largest share of open requests.
df2.sort_values('Unresolved_percentage', ascending=False).iloc[:5].sum()

In [None]:
# Conclusion 3: Since the maximum number of complaints were filed in Brooklyn, complaint data was analysed for this city.
# Blocked Driveway was found to be the most frequent complaint lodged to NYPD in the city of Brooklyn.

# Select the Brooklyn complaint types once, with a single .loc call, instead of
# filtering twice with chained indexing.
brooklyn_complaints = data.loc[data['City'] == 'BROOKLYN', 'Complaint Type']

# Only the last expression of a cell is displayed, so the counts must be printed
# explicitly or they are silently discarded.
print(brooklyn_complaints.value_counts())

plot_3 = sns.countplot(x=brooklyn_complaints, palette='YlOrRd_r')
plot_3.set_xticklabels(plot_3.get_xticklabels(), rotation=90)

In [None]:
# Conclusion 4: It was found that all the complaints were lodged to the New York Police Department.
# Rows are agency names, columns are complaint types (at most the first 50 rows are kept).
df3 = (
    data
    .groupby(by=['Agency Name', 'Complaint Type'])
    .size()
    .unstack()
    .head(n=50)
)
df3.plot(kind='bar', figsize=(8, 8))

In [None]:
# Distinct location types, in order of first appearance.
pd.unique(data['Location Type'])

In [None]:
# Frequency of each location type (missing values are excluded by default).
data['Location Type'].value_counts()

In [None]:
# Conclusion 5: The maximum complaints were regarding Street and Sidewalk, which indicates that
# there is scope for improvement in strict measures on the road; thus Traffic police can look into this matter.

# Fill missing location types with 'Street/Sidewalk'.
# Assign the result back to the column: calling fillna(..., inplace=True) on a
# column selection is chained assignment, which does not update `data` under
# pandas Copy-on-Write and emits a FutureWarning on recent pandas releases.
data['Location Type'] = data['Location Type'].fillna(value='Street/Sidewalk')

In [None]:
# Count plot of requests per location type.
plt.figure(figsize=(10, 5))
# Pass the series as the `x` keyword (as the Brooklyn count plot does); a bare
# positional argument is treated as `data` by newer seaborn releases.
plot_4 = sns.countplot(x=data['Location Type'])
plot_4.set_xticklabels(plot_4.get_xticklabels(), rotation=90)

In [None]:
# Conclusion 6: Major complaint types and their counts.
# Keep the ten most frequent complaint types, then list the labels of the five largest.
df4 = data['Complaint Type'].value_counts().iloc[:10]
df4.nlargest(n=5).index

In [None]:
# Display the ten most frequent complaint types with their counts.
df4

In [None]:
# Bar chart of the five most frequent complaint types.
plt.figure(figsize=(5, 5))
top_five = df4.nlargest(n=5)
plot_5 = sns.barplot(x=top_five.index, y=top_five.values)
plot_5.set_xticklabels(labels=plot_5.get_xticklabels(), rotation=90)

In [None]:
# First five closing times, in minutes.
data['Request_Closing_Time_mins'].head(n=5)

In [None]:
# Conclusion 7: Average resolving time of a complaint was observed for different Boroughs.
plt.figure(figsize=(8, 7))
sns.barplot(data=data, x='Borough', y='Request_Closing_Time_mins')
plt.title(label='Average Request Closing Time for Boroughs')

In [None]:
# Conclusion 8: Average request closing time for the various complaint types.
df7 = data.loc[:, ['Complaint Type', 'Request_Closing_Time_mins']]
# Mean closing time per complaint type; types with no computable mean become 0.
df8 = (
    df7
    .groupby('Complaint Type')['Request_Closing_Time_mins']
    .mean()
    .fillna(value=0)
    .to_frame()
)

In [None]:
# Preview the mean closing time (minutes) of the first five complaint types.
df8.head()

In [None]:
# Expose the complaint type as a regular column so it can be used as the x variable.
df8['Complaint Type'] = df8.index
plt.figure(figsize=(10, 5))
# Bars are ordered from the shortest to the longest mean closing time.
sns.barplot(
    data=df8.sort_values(by='Request_Closing_Time_mins'),
    x='Complaint Type',
    y='Request_Closing_Time_mins',
)
plt.xticks(rotation=90)

In [None]:
# Task 3: Order the complaint types based on the average ‘Request_Closing_Time’, grouping them for different locations

In [None]:
# Mean closing time (minutes) for every (city, complaint type) pair.
# NOTE(review): Task 3 asks for the complaint types to be ordered by average
# closing time; this table is not sorted by that value.
df9 = (
    data
    .groupby(by=['City', 'Complaint Type'])['Request_Closing_Time_mins']
    .mean()
)
df9.unstack().fillna(value=0).head(n=5)

In [None]:
# Task 5: Perform a statistical test for the following:
# Please note: For the below statements you need to state the Null 
# and Alternate and then provide a statistical test to accept or reject the Null Hypothesis along with the corresponding ‘p-value’.
# Whether the average response time across complaint types is similar or not (overall)
# Are the type of complaint or service requested and location related?

In [None]:
# Distinct complaint types, in order of first appearance.
pd.unique(data['Complaint Type'])

In [None]:
# Since Blocked Driveway is the most frequent complaint, that data is analysed first.
df10 = data.loc[data['Complaint Type'] == 'Blocked Driveway', 'Request_Closing_Time_mins']
df10.hist(range=(0, 1500))
# The data is skewed and needs to be converted to an approximately Gaussian shape.
# NOTE(review): the original note called this "left skewed"; a long tail toward
# large closing times would be right (positive) skew - confirm against the plot.

In [None]:
# Same histogram for 'Noise - Street/Sidewalk'.
# NOTE: the name df11 is reassigned to a different frame further down the notebook.
df11 = data.loc[data['Complaint Type'] == 'Noise - Street/Sidewalk', 'Request_Closing_Time_mins']
df11.hist(range=(0, 1500))  # Similar result

In [None]:
# Applying log transformation
# Build {complaint type: log(closing time in minutes)} for every complaint type.
# Only strictly positive durations are kept: requests without a closing time are
# NaN, and zero or negative durations turn into -inf / NaN under np.log. Either
# would propagate into the ANOVA below and make its p-value NaN.
dataset = {}
closing_minutes = data['Request_Closing_Time_mins']
for complaint_type in data['Complaint Type'].unique():
    durations = closing_minutes[data['Complaint Type'] == complaint_type]
    durations = durations[durations > 0]  # also drops NaN (NaN > 0 is False)
    dataset[complaint_type] = np.log(durations)

In [None]:
# Complaint types for which a log-transformed sample was built.
dataset.keys()

In [None]:
# Histogram of the log-transformed closing times for 'Blocked Driveway'.
dataset['Blocked Driveway'].hist()

In [None]:
# Histogram of the log-transformed closing times for 'Noise - Street/Sidewalk'.
dataset['Noise - Street/Sidewalk'].hist()

In [None]:
# ANOVA Analysis (Checking for top 5 complaints)
#1. Null Hypothesis: The average response time across complaint types is not different
# Alternate Hypothesis: The average response time across complaint types is different

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA on the log closing times of five complaint types.
anova_groups = [
    'Noise - Street/Sidewalk',
    'Blocked Driveway',
    'Illegal Parking',
    'Derelict Vehicle',
    'Noise - Commercial',
]
# Keep only finite values: a single NaN or -inf in any sample makes f_oneway
# return NaN, and `nan > alpha` is always False, which would print
# "rejected" regardless of the data.
samples = [dataset[name][np.isfinite(dataset[name])] for name in anova_groups]
stat, p = f_oneway(*samples)

alpha = 0.05
# The task asks for the p-value to be reported alongside the decision.
print(f'F-statistic = {stat:.4f}, p-value = {p:.4g}')
if np.isnan(p):
    print('Test is inconclusive: the p-value could not be computed')
elif p > alpha:
    print('Null Hypothesis is accepted')
else:
    print('Null hypothesis is rejected')

In [None]:
# 2.Are the type of complaint or service requested and location related?

In [None]:
# Working frame with the complaint type and the location-related columns.
# .copy() makes it an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
df11 = data[['Complaint Type', 'Location', 'Latitude', 'Longitude', 'City', 'Borough']].copy()

In [None]:
# Preview the first five rows of the location frame.
df11.head(n=5)

In [None]:
# Replace each text column by its integer category code (missing values become -1).
for column in ('Complaint Type', 'City', 'Borough'):
    df11[column] = df11[column].astype('category').cat.codes

In [None]:
# Preview the first five rows after encoding the categorical columns.
df11.head(n=5)

In [None]:
from scipy.stats import chi2_contingency

# Null Hypothesis: complaint type and location (City) are independent
# Alternate Hypothesis: complaint type and location (City) are related
# Both variables are nominal, so a chi-square test of independence on their
# contingency table is used; rows with a missing value in either column are
# left out by crosstab.
contingency = pd.crosstab(data['Complaint Type'], data['City'])
chi2, p_location, dof, _ = chi2_contingency(contingency)

alpha = 0.05
print(f'chi2 = {chi2:.2f}, dof = {dof}, p-value = {p_location:.4g}')
if p_location > alpha:
    print('Null Hypothesis is accepted')
else:
    print('Null hypothesis is rejected')

# Pearson correlation matrix, kept for reference only: the category codes are
# arbitrary integers, so these coefficients are not a valid measure of
# association between complaint type and location.
# Restrict to numeric columns first: DataFrame.corr raises on non-numeric
# columns (e.g. 'Location', if it is stored as text) in pandas >= 2.0.
df11.select_dtypes(include='number').corr(method='pearson')