# DATA VISUALIZATION PROJECT

Statement of Academic Honesty:
The following code represents our own work. We have neither received nor given inappropriate assistance. We have not copied or modified code from any source other than the course webpage or the course textbook. We recognize that any unauthorized assistance or plagiarism will be handled in accordance with Georgia State University's Academic Honesty Policy and the policies of this course. We recognize that our work is based on an assignment created by the Institute for Insight at Georgia State University. Any publishing or posting of source code for this project is strictly prohibited unless you have written consent from the Institute for Insight at Georgia State University.
 

In [None]:
#Importing all the required packages
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
from pylab import rcParams
from pandas.api.types import CategoricalDtype
import plotly.graph_objects as go
import plotly.express as px
import datetime
from plotly.subplots import make_subplots
import folium
from folium.plugins import HeatMap
import pandas_profiling

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option("display.float_format",lambda x: "%.5f" % x)
pd.set_option("display.max_columns",None)

from wordcloud import WordCloud, STOPWORDS
import urllib
from PIL import Image
import seaborn as sns
import plotly.express as px
from datetime import datetime


### Reading Data from CSV

In [None]:
# Load the incident records from the CSV (decoded as latin-1).
df = pd.read_csv('crime.csv', encoding='latin-1')

# The three names below are aliases of the frame just loaded, not copies:
# an in-place change made through one of them is visible through the others.
crime = df
data = df
top5_crime = df

crime.head()

### Data Pre-Processing

In [None]:
# Remove the two columns that the rest of the notebook does not use.
# drop() returns a new frame, so `df` is rebound to that result here.
df = df.drop(columns=['SHOOTING', 'OFFENSE_CODE'])

# Parse the occurrence timestamps into datetime values.
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'])

# Store the weekday as an ordered categorical running Monday -> Sunday.
df["DAY_OF_WEEK"] = df["DAY_OF_WEEK"].astype(
    CategoricalDtype(
        categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday'],
        ordered=True,
    )
)

In [None]:
# Mapping from the raw upper-case column names to shorter display names.
rename = dict(
    OFFENSE_CODE_GROUP='Group',
    OFFENSE_DESCRIPTION='Description',
    DISTRICT='District',
    STREET='Street',
    OCCURRED_ON_DATE='Date',
    YEAR='Year',
    MONTH='Month',
    DAY_OF_WEEK='Day',
    HOUR='Hour',
)

# NOTE(review): index=str also converts every row label to a string;
# confirm whether that is intended or a leftover.
df = df.rename(index=str, columns=rename)

# new feature
def create_features(df):
    """Add calendar features derived from the ``Date`` column to ``df``.

    The columns ``dayofweek``, ``quarter``, ``dayofyear``, ``dayofmonth`` and
    ``weekofyear`` are written onto ``df`` itself (the argument is modified).

    Args:
        df: DataFrame with a datetime-like ``Date`` column.

    Returns:
        A DataFrame holding only the five feature columns.
    """
    dates = df['Date'].dt
    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['dayofyear'] = dates.dayofyear
    df['dayofmonth'] = dates.day
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement (ISO week number).
    df['weekofyear'] = dates.isocalendar().week

    X = df[['dayofweek', 'quarter', 'dayofyear',
            'dayofmonth', 'weekofyear']]
    return X
# Add the calendar feature columns to df (create_features writes onto df).
create_features(df).head()

# Store four of the new features as categoricals; weekofyear is not converted.
for feature in ('quarter', 'dayofweek', 'dayofyear', 'dayofmonth'):
    df[feature] = df[feature].astype(CategoricalDtype())


df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Drop rows whose INCIDENT_NUMBER duplicates another row.
df = df.drop_duplicates(subset="INCIDENT_NUMBER")

In [None]:
# Discard every row that still has a missing value, then report the
# per-column null counts and the resulting shape.
df = df.dropna()
remaining_nulls = df.isnull().sum()
print(remaining_nulls, "\nShape:", df.shape)

## Word Cloud

In [None]:
#Using wordcloud to show the types of crime most prominent in Boston

def random_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """Return an HSL colour string with fixed hue/saturation and random lightness.

    Only ``random_state`` is used; the other parameters are accepted and ignored.

    Args:
        random_state: object exposing ``randint(low, high)``. When ``None``,
            a fresh ``random.Random()`` is used instead of raising
            ``AttributeError``.

    Returns:
        A string of the form ``"hsl(H, S%, L%)"``.
    """
    if random_state is None:
        # Local import keeps this block self-contained.
        import random
        random_state = random.Random()

    hue = int(360.0 * 45.0 / 255.0)
    saturation = int(100.0 * 255.0 / 255.0)
    # Lightness is the only random component.
    lightness = int(100.0 * float(random_state.randint(60, 120)) / 255.0)

    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

# Read the whole text corpus; the context manager closes the file handle,
# which the previous bare open(...).read() never did.
with open("bos_crime.txt", encoding="utf8") as corpus_file:
    file_content = corpus_file.read()

# NOTE(review): the font path is Windows-specific; this cell presumably
# fails on other systems unless the path is adjusted.
homeless_youth = WordCloud(font_path = r'C:\Windows\Fonts\Verdana.ttf',
                            stopwords = STOPWORDS,
                            background_color = 'white'
                            ).generate(file_content)

# Show the word cloud without axes.
fig = plt.figure(figsize=(12,8))
plt.imshow(homeless_youth,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#Masking the word cloud in the map of Massachusetts state of US as Boston is the capital of Massachusetts

# Convert the image to an array inside the with-block so the pixel data is
# read before the underlying file handle is closed.
with Image.open(r'ma.png') as mask_image:
    custom_mask = np.array(mask_image)

# NOTE(review): nothing is drawn on this figure in this cell; the next cell
# creates its own figure.
fig = plt.figure(figsize=(20,15))

In [None]:
# Default stop words plus 'Motor'.
stopwordsm = set(STOPWORDS) | {'Motor'}

# Word cloud shaped by custom_mask, on a black background.
crime_WC = WordCloud(
    mask=custom_mask,
    stopwords=stopwordsm,
    max_words=2000,
    background_color='Black',
).generate(file_content)


# Render the masked cloud without axes.
fig = plt.figure(figsize=(20, 15))
plt.imshow(crime_WC, interpolation='bilinear')
plt.axis('off')
plt.show()

### The word cloud shows the types of crimes that have happened in Boston; the more prominent a word, the more frequent the crime.

In [None]:
# Top 6 most frequently occurring crimes in Boston.
# Seven rows are taken here; one of them is dropped in a later cell.
z = crime['OFFENSE_CODE_GROUP'].value_counts()
y = z.to_frame()
x = y.head(7)
x

In [None]:
# Turn the offense-group index into an ordinary column (row labels become 0..6).
x = x.reset_index(level=0)
x

In [None]:
# After reset_index the frame has exactly two columns: the offense group
# first, its count second. Name them by position, because the automatic
# column names differ between pandas versions ('index'/'OFFENSE_CODE_GROUP'
# before 2.0, 'OFFENSE_CODE_GROUP'/'count' from 2.0), and a name-based rename
# would label the wrong column on newer versions.
x.columns = ["CRIME", "NUMBER OF INCIDENTS"]
x

In [None]:
# Remove the row labelled 4, leaving six rows for the pie chart.
# NOTE(review): presumably row 4 is a catch-all category — confirm against the data.
w = x.drop(index=4)
w

## Pie Chart

In [None]:
# One colour per pie slice.
colors_list = ['#030764', '#0000FF', '#069AF3', 'skyblue', '#06C2AC', '#7FFFD4']

# Pie of incident counts; slice labels are suppressed and the crime names are
# shown through the legend instead.
w['NUMBER OF INCIDENTS'].plot.pie(
    figsize=(15, 6),
    colors=colors_list,
    labels=None,
    autopct='%1.1f%%',
    pctdistance=1.12,
    startangle=90,
    shadow=True,
)

plt.title('TOP 6 CRIMES OCCURING FREQUENTLY IN BOSTON')
plt.axis('equal')
plt.legend(labels=w['CRIME'], loc='upper left')

plt.show()

### It shows the percentage contribution of top 6 major crimes in Boston.

In [None]:
# Number of crimes per (UCR part, offense group) pair.
# UCR = Uniform Crime Reporting.
e = (
    crime.groupby(['UCR_PART', 'OFFENSE_CODE_GROUP'])[['OFFENSE_CODE_GROUP']]
    .count()
    .rename(columns={'OFFENSE_CODE_GROUP': 'NUMBER_OF_CRIMES'})
    .reset_index()
)
e.head()

## Sunburst in Plotly

In [None]:
# Sunburst over the hierarchy UCR part -> offense group, sized and coloured
# by the number of crimes.
px.sunburst(
    e,
    path=["UCR_PART", "OFFENSE_CODE_GROUP"],
    values="NUMBER_OF_CRIMES",
    color="NUMBER_OF_CRIMES",
    color_continuous_scale='orrd',
)

In [None]:
def _incidents_per_year(offense_group):
    """Return ``{year: incident count}`` for one offense group.

    Every year present in ``top5_crime`` starts at 0 and is then overwritten
    with the number of rows of ``offense_group`` recorded for that year.
    """
    counts = {year: 0 for year in top5_crime["YEAR"].unique()}
    in_group = top5_crime["OFFENSE_CODE_GROUP"] == offense_group
    # value_counts does the tallying in one vectorised pass instead of a
    # Python-level loop over every matching row.
    counts.update(top5_crime.loc[in_group, "YEAR"].value_counts().to_dict())
    return counts


moter_vehicle_dict = _incidents_per_year("Motor Vehicle Accident Response")
larceny_dict = _incidents_per_year("Larceny")
medical_assistance_dict = _incidents_per_year("Medical Assistance")
investigate_person_dict = _incidents_per_year("Investigate Person")
drug_violation_dict = _incidents_per_year("Drug Violation")


# Order the counts by year for plotting. .get(..., 0) means a year that is
# absent from the data contributes 0 instead of raising KeyError.
years = [2015, 2016, 2017, 2018]
moter_vehicle_list = [(k, moter_vehicle_dict.get(k, 0)) for k in years]
larceny_list = [(k, larceny_dict.get(k, 0)) for k in years]
medical_assistance_list = [(k, medical_assistance_dict.get(k, 0)) for k in years]
investigate_person_list = [(k, investigate_person_dict.get(k, 0)) for k in years]
drug_violation_list = [(k, drug_violation_dict.get(k, 0)) for k in years]



In [None]:
# Plotting the graphs

# Matplotlib 3.6 renamed the 'seaborn-dark' style to 'seaborn-v0_8-dark' and
# later removed the old name; use whichever this installation provides.
dark_style = 'seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark'
plt.style.use(dark_style)
fig, ax = plt.subplots(figsize=(10,10))

# Hide the frame around the plot.
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)

# Setting the ticks only on the bottom and the left of the graph
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

labels = [2015,2016,2017,2018]
plt.xticks(labels, fontsize=14)
plt.yticks(fontsize=14)

plt.ylim(500, 15000)

# One entry per line: ((year, count) pairs, colour, label, y position of the
# text annotation placed at x=2018).
series_specs = [
    (moter_vehicle_list, "black", "Motor Vehicle Accident Response", 8000),
    (larceny_list, "red", "Larceny", 5000),
    (medical_assistance_list, "blue", "Medical Assistance", 5500),
    (investigate_person_list, "orange", "Investigate Person", 3700),
    (drug_violation_list, "green", "Drug Violation", 3000),
]

for pairs, line_color, series_label, text_y in series_specs:
    xs = [year for year, _ in pairs]
    ys = [count for _, count in pairs]
    ax.plot(xs, ys, color=line_color, label=series_label, linestyle="--")
    # Annotate each line with its name, in the line's own colour.
    plt.text(2018, text_y, series_label, fontsize=10, color=line_color)

for tick in ax.get_xticklabels():
    tick.set_rotation(90)

ax.set_title("Frequency of Most Occuring Top 5 Crimes\n",fontname="monospace", fontsize=10)
ax.set_xlabel("Year", fontsize=18)
ax.set_ylabel("Number of Crimes\n", fontsize=16)

plt.show()


### 1. Motor Vehicle Accident Response has the highest number of incidents over the years 2015-2018
### 2. All offences were at their peak during 2016-2017
### 3. Investigate Person and Drug Violation have smoother curves than the others

## Crime Vs Time

In [None]:
# Month number -> month name.
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December',
}

# Replace the whole column in one step instead of writing strings into an
# integer column cell by cell, an incompatible-dtype assignment that newer
# pandas deprecates. Values that are not month numbers (for example names
# left by an earlier run of this cell) pass through unchanged.
top5_crime['MONTH'] = top5_crime['MONTH'].map(
    lambda month: month_names.get(month, month)
)

In [None]:
plt.style.use('ggplot')
sns.set_context('notebook')

# Horizontal bars, one per month, in calendar order.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               "July", 'August', 'September', 'October', 'November', 'December']
sns.countplot(data=top5_crime, y='MONTH', order=month_order)

# Title and axis labels.
title_font = {'fontsize': 40, 'color': '#DF0D0D', 'fontname': 'Agency FB'}
plt.title("Crimes rise during Summer !", fontdict=title_font, weight="bold")
plt.xlabel("\nNumber of Crimes", fontdict={'fontsize': 20}, weight="bold", color="#833636")
plt.ylabel("Month\n", fontdict={'fontsize': 20}, weight="bold", color="#833636")

# Tick label styling.
plt.yticks(fontsize=15, color='black')
plt.xticks(fontsize=15, color='black')
plt.show()

### When temperatures are extremely cold or hot, people stay indoors. But as temperatures become more comfortable, more people are outdoors, which presents greater opportunities for crime.

In [None]:
# Set plot style
# Matplotlib 3.6 renamed the 'seaborn-dark' style to 'seaborn-v0_8-dark' and
# later removed the old name; use whichever this installation provides.
dark_style = 'seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark'
plt.style.use(dark_style)
sns.set_context('paper')

# Number of incidents for each hour of the day.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='HOUR', data=top5_crime, palette="viridis")

# Aesthetic appeal
plt.title("Unsafest Hours in Boston", fontdict={'fontsize': 40, 'color': '#bb0e14','fontname':'Agency FB'}, weight="bold")
plt.xlabel("\nHour in the Day", fontdict={'fontsize': 15}, weight='bold')
plt.ylabel("Number of Crimes\n", fontdict={'fontsize': 15}, weight="bold")

# Add Text to the plot (position is in data coordinates)
plt.text(2.5, 5500, 'Lowest Crime Rate', fontdict={'fontsize': 10, 'color':"blue" }, weight='bold')

plt.show()

## Criminals need to sleep too

In [None]:
plt.style.use('ggplot')

# 24-hour pattern for a single offense group.
crime_df = top5_crime[top5_crime['OFFENSE_CODE_GROUP'] == 'Motor Vehicle Accident Response']

# Hours that occur for this offense, ascending, and the matching counts
# (sort_index puts the counts in the same ascending-hour order).
hours = sorted(int(hour) for hour in crime_df['HOUR'].unique())
h_cri = crime_df['HOUR'].value_counts().sort_index().tolist()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=hours, y=h_cri, palette='inferno')

# Title, axis labels and tick sizes.
tit = "Motor Vehicle Accident over 24 Hours"
title_font = {'fontsize': 40, 'color': '#bb0e14', 'fontname': 'Agency FB'}
plt.title(tit, fontdict=title_font, weight="bold")
plt.xlabel("\nHour in the Day", fontdict={'fontsize': 20}, weight='bold')
plt.ylabel("Number of Crimes\n", fontdict={'fontsize': 20}, weight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()

### The maximum falls between 4 pm and 6 pm, because more vehicles are on the road during the office rush hour

In [None]:
plt.style.use('ggplot')

# 24-hour pattern for a single offense group.
crime_df = top5_crime[top5_crime['OFFENSE_CODE_GROUP'] == 'Towed']

# Hours that occur for this offense, ascending, and the matching counts
# (sort_index puts the counts in the same ascending-hour order).
hours = sorted(int(hour) for hour in crime_df['HOUR'].unique())
h_cri = crime_df['HOUR'].value_counts().sort_index().tolist()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=hours, y=h_cri, palette='inferno')

# Title, axis labels and tick sizes.
tit = "Towed over 24 Hours"
title_font = {'fontsize': 40, 'color': '#bb0e14', 'fontname': 'Agency FB'}
plt.title(tit, fontdict=title_font, weight="bold")
plt.xlabel("\nHour in the Day", fontdict={'fontsize': 20}, weight='bold')
plt.ylabel("Number of Crimes\n", fontdict={'fontsize': 20}, weight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

plt.show()

### Cars get towed mostly between 7 am and 10 am, because street sweeping is done in the morning

In [None]:
# Count of incidents for each day of the week, drawn as a wide count plot.
sns.catplot(
    data=data,
    x='DAY_OF_WEEK',
    kind='count',
    height=8,
    aspect=3,
)
# Large fonts to suit the wide figure.
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel(' ', fontsize=40)
plt.ylabel('Number of Crimes', fontsize=40)

### Friday has the highest crime rate. It appears that the high number of crimes committed on Friday was triggered by the fact that many people were out on Friday night due to Saturday being a holiday. Other days do not have a significant variance but Sunday has the lowest rate. The fact that most of the people spend Sundays at home may be a factor that reduces crime

In [None]:
# Count of incidents for each year, drawn as a wide count plot.
sns.catplot(
    data=data,
    x='YEAR',
    kind='count',
    height=8,
    aspect=3,
)
# Large fonts to suit the wide figure.
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel('Year ', fontsize=40)
plt.ylabel('Number of Crimes', fontsize=40)

Below contribution is from Abhishek Bhagat

## Distribution of top three crimes on Boston Map 

**Motor Vehicle Accident Response**

In [None]:
map_hooray = folium.Map(location=[42.361145,-71.057083],
                    zoom_start = 12, min_zoom=12)

# Apply both filters together. Previously each assignment restarted from the
# full frame and the heat data was built from `df`, so the map showed every
# incident instead of this offense group in this year.
selected_rows = (df['Year'] == 2017) & (df['Group'] == 'Motor Vehicle Accident Response')
heat_df = df.loc[selected_rows, ['Lat', 'Long']]

# [[lat, long], ...] built in one vectorised step instead of iterrows().
heat_data = heat_df.values.tolist()
HeatMap(heat_data, radius=10).add_to(map_hooray)
map_hooray

**Medical Assistance**

In [None]:
# zoom_start is 12 so that it is not below min_zoom (it was 10 before).
map_hooray = folium.Map(location=[42.361145,-71.057083],
                    zoom_start = 12, min_zoom=12)

# Apply both filters together. Previously each assignment restarted from the
# full frame and the heat data was built from `df`, so the map showed every
# incident instead of this offense group in this year.
selected_rows = (df['Year'] == 2017) & (df['Group'] == 'Medical Assistance')
heat_df = df.loc[selected_rows, ['Lat', 'Long']]

# [[lat, long], ...] built in one vectorised step instead of iterrows().
heat_data = heat_df.values.tolist()
HeatMap(heat_data, radius=10).add_to(map_hooray)
map_hooray

**Larceny**

In [None]:
map_hooray = folium.Map(location=[42.361145,-71.057083],
                    zoom_start = 12, min_zoom=12)

# Apply both filters together. Previously each assignment restarted from the
# full frame and the heat data was built from `df`, so the map showed every
# incident instead of this offense group in this year.
# NOTE(review): this map uses 2016 as the year — confirm that is intended.
selected_rows = (df['Year'] == 2016) & (df['Group'] == 'Larceny')
heat_df = df.loc[selected_rows, ['Lat', 'Long']]

# [[lat, long], ...] built in one vectorised step instead of iterrows().
heat_data = heat_df.values.tolist()
HeatMap(heat_data, radius=10).add_to(map_hooray)
map_hooray

**Distribution of all the crimes in Boston**

Below contribution is from Kunal Jha

In [None]:
# Rows of the raw frame that have no missing value in any column.
# NOTE(review): `data` still contains every original column, so this may
# discard far more rows than intended — confirm.
my = data.dropna()

# The 'ID' column previously held the builtin function `id`, not data.
# NOTE(review): INCIDENT_NUMBER is assumed to be the intended identifier — confirm.
df_counters = pd.DataFrame(
    {'ID' : my.INCIDENT_NUMBER,
     'Name' : my.OFFENSE_CODE_GROUP,
     'lat' : my.Lat,
     'long' : my.Long,
     'region' : my.DISTRICT,
     'year': my.YEAR,
     'month': my.MONTH
    })

# Popup text for each marker, in the same row order as the coordinates.
arrayName = list(my.OFFENSE_CODE_GROUP)
df_counters.head()
locations = df_counters[['lat', 'long']]
locationlist = locations.values.tolist()

# One marker per incident, labelled with its offense group.
BostonMap=folium.Map(location=[42.36,-71.05],zoom_start=11)
for coordinates, offense_name in zip(locationlist, arrayName):
    folium.Marker(coordinates, popup=offense_name).add_to(BostonMap)
BostonMap

# The END 