# Codebook

In [1]:
import pandas as pd
import numpy as np 
import os
from collections import Counter
import matplotlib.pyplot as plt
import statistics
import boto3

# Districts-07-.csv

#### About this dataset

Data provided by: 

Source: https://daanmatchdatafiles.s3-us-west-1.amazonaws.com/DaanMatch_DataFiles/Districts-07-.csv

Type: csv

Last Modified: May 29, 2021, 19:54:14 (UTC-07:00)

Size: 212.0 B

In [2]:
# Create the low-level client and the resource-style handle for S3.
# No credentials are passed explicitly here, so boto3 uses its default lookup.
client = boto3.client('s3')
resource = boto3.resource('s3')
# Use the bucket that actually holds this dataset (the bucket named in the
# s3:// path this notebook reads from) instead of the 'my-bucket' placeholder.
my_bucket = resource.Bucket('daanmatchdatafiles')

In [4]:
# Read the CSV directly from the project's S3 bucket into a DataFrame.
path = "s3://daanmatchdatafiles/DaanMatch_DataFiles/Districts-07-.csv"
Districts_07 = pd.read_csv(filepath_or_buffer=path)
# Last expression of the cell: render the loaded table.
Districts_07

Unnamed: 0,KeyColumn,Name,Value
0,90,North,90
1,90,North West,90
2,92,North East,92
3,93,East,93
4,94,New Delhi,94
5,95,Central,95
6,96,West,96
7,97,South West,97
8,98,South,98
9,702,Shahdara,702
10,703,South East,703


#### What's in the dataset?

In [8]:
# Report the table's dimensions, then state what one row represents.
print(f"Shape: {Districts_07.shape}")
print(f"Rows: {len(Districts_07)}")
print(f"Columns: {len(Districts_07.columns)}")
print("Each row is a District in Delhi.")

Shape: (11, 3)
Rows: 11
Columns: 3
Each row is a District in Delhi.


#### Codebook

In [9]:
# Build the codebook: one row per column of Districts_07 with the column's name,
# a hand-written description, and the dtype pandas assigned to it.
Districts_07_columns = list(Districts_07.columns)
# Descriptions are matched to columns by position, so keep them in column order.
Districts_07_description = [
    "Same as the Value column.",
    "Name of District in the state of Delhi.",
    "Value of the District.",
]
Districts_07_dtypes = list(Districts_07.dtypes)

data = {
    "Column Name": Districts_07_columns,
    "Description": Districts_07_description,
    "Type": Districts_07_dtypes,
}
Districts_07_codebook = pd.DataFrame(data)
# Give the Description column a fixed width in the styled (rendered) table.
Districts_07_codebook.style.set_properties(subset=["Description"], width="600px")

Unnamed: 0,Column Name,Description,Type
0,KeyColumn,Same as the Value column.,int64
1,Name,Name of District in the state of Delhi.,object
2,Value,Value of the District.,int64


#### Missing values

In [10]:
# Count the missing entries in each column.
Districts_07.isna().sum()

KeyColumn    0
Name         0
Value        0
dtype: int64

#### Summary statistics

In [11]:
# Summary statistics for the numeric columns (KeyColumn and Value).
# NOTE(review): these columns look like district codes rather than measurements,
# so mean/std may not be meaningful — confirm once the meaning of Value is known.
Districts_07.describe()

Unnamed: 0,KeyColumn,Value
count,11.0,11.0
mean,204.545455,204.545455
std,246.209002,246.209002
min,90.0,90.0
25%,92.5,92.5
50%,95.0,95.0
75%,97.5,97.5
max,703.0,703.0


## Columns

### Name

The `Name` column contains the name of each district in Delhi, India.

In [12]:
# Pull out the Name column as a Series and display it.
column = Districts_07.loc[:, "Name"]
column

0          North
1     North West
2     North East
3           East
4      New Delhi
5        Central
6           West
7     South West
8          South
9       Shahdara
10    South East
Name: Name, dtype: object

In [13]:
# How many distinct values does the column hold?
print(f"No. of unique values: {column.unique().size}")

# Tally every value, then keep only the ones that occur more than once.
counter = dict(Counter(column))
duplicates = {entry: count for entry, count in counter.items() if count > 1}
print(f"Duplicates: {duplicates}")

No. of unique values: 11
Duplicates: {}


### Value

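The `Value` column holds a numeric value for each district in Delhi. **TODO:** the exact meaning of these values is not yet known (Cara is looking into what they mean). The values are not unique: North and North West both have the value 90.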

In [14]:
# Pull out the Value column as a Series and display it.
column = Districts_07.loc[:, "Value"]
column

0      90
1      90
2      92
3      93
4      94
5      95
6      96
7      97
8      98
9     702
10    703
Name: Value, dtype: int64

In [15]:
# How many distinct values does the column hold?
print(f"No. of unique values: {column.unique().size}")

# Tally every value, then keep only the ones that occur more than once.
counter = dict(Counter(column))
duplicates = {entry: count for entry, count in counter.items() if count > 1}
print(f"Duplicates: {duplicates}")
# Only report a count when at least one value is repeated.
if duplicates:
    print(f"No. of duplicates: {len(duplicates)}")

No. of unique values: 10
Duplicates: {90: 2}
No. of duplicates: 1


In [16]:
# Show the rows that share the duplicated Value of 90.
Districts_07.loc[Districts_07["Value"].eq(90)]

Unnamed: 0,KeyColumn,Name,Value
0,90,North,90
1,90,North West,90
