# Codebook  
**Authors:** Lauren Baker  
Documenting existing data files of DaanMatch with information about location, owner, "version", source, etc.

In [2]:
import boto3
import numpy as np 
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import statistics

In [3]:
# S3 handles used to reach the DaanMatch data files: the resource API (and the
# bucket object derived from it) plus a low-level client.
resource = boto3.resource(service_name='s3')
my_bucket = resource.Bucket('daanmatchdatafiles')
client = boto3.client(service_name='s3')

# CSR 2016_2017.xlsx

## TOC:
* [About this dataset](#1)
* [Sheet 1](#2)
    * [What's in this dataset](#2.1)
    * [Codebook](#2.2)
        * [Missing values](#2.2.1)
        * [Summary statistics](#2.2.2)
    * [Columns](#2.3)
        * [CIN](#2.3.1)
        * [COMPANY_NAME](#2.3.2)
* [Sheet 2](#3)
    * [What's in this dataset](#3.1)
    * [Codebook](#3.2)
        * [Missing values](#3.2.1)
        * [Summary statistics](#3.2.2)
    * [Columns](#3.3)
        * [CIN](#3.3.1)
        * [COMPANY_NAME](#3.3.2)
* [Sheet 3](#4)

**About this dataset**  <a class="anchor" id="1"></a>  
Data provided by: Unknown.  
Source: https://daanmatchdatafiles.s3-us-west-1.amazonaws.com/DaanMatch_DataFiles/CSR+2016_2017.xlsx  
Type: xlsx  
Last Modified: May 29, 2021, 19:52:24 (UTC-07:00)  
Size: 487.4 KB

In [4]:
# Open the workbook straight from S3 and list the sheets it contains.
path = "s3://daanmatchdatafiles/DaanMatch_DataFiles/CSR 2016_2017.xlsx"
CSR_2016_2017 = pd.ExcelFile(path_or_buffer=path)
print(CSR_2016_2017.sheet_names)

['Sheet1', 'Sheet2', 'Sheet3']


In [5]:
# Show dataframe 1
CSR_2016_2017_1 = CSR_2016_2017.parse('Sheet1')
CSR_2016_2017_1.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,,,,,,,,,,,,,,,,,
1,,,AS Prescribed,,,,Less than Prescribed,,,,More than Prescribed,,,,,,,
2,,Contributing Companies,,,,Contributing Companies,,,,,,,,,,,,"The Ministry of Corporate Affairs (""MCA""), vid..."
3,,S.No.,Company Name(s),Amount (Actuals),,S.No.,Company Name(s),Amount (Actuals),,S.No.,Company Name(s),Amount (Actuals),,,,,,
4,,1,India Infrastructure Finance Company Limited,244600000,,1,Tata Consultancy Services Limited,2802200000,,1,Reliance Industries Limited,6397000000,,,,,,Present Corporate Social Responsibility Norms ...


In [6]:
# Show dataframe 2
CSR_2016_2017_2 = CSR_2016_2017.parse('Sheet2')
CSR_2016_2017_2.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,,Development Sector-wise,,,,
2,,Development Sectors,,Amount Spent FY 2014-15 (INR Cr.),Amount Spent FY 2015-16 (INR Cr.),Amount Spent FY 2016-17 (INR Cr.)
3,,1,Clean Ganga Fund,5.47,32.65,24.23
4,,2,"Education, Differently Abled, Livelihood",3188.09,4881.26,5123.83


In [7]:
# Show dataframe 3
CSR_2016_2017_3 = CSR_2016_2017.parse('Sheet3')
CSR_2016_2017_3.head()

## Sheet 1
<a class="anchor" id="2"></a>

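Cleaning up the data in Sheet 1: the sheet holds three side-by-side tables ("AS Prescribed", "Less than Prescribed" and "More than Prescribed"), each with the columns S.No., Company Name(s) and Amount (Actuals). The cells below split them into separate dataframes.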

In [8]:
# Raw Sheet1 again, for reference while slicing it: row 3 carries the table
# headers and the records start at row 4.
CSR_2016_2017_1.head(n=5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,,,,,,,,,,,,,,,,,,
1,,,AS Prescribed,,,,Less than Prescribed,,,,More than Prescribed,,,,,,,
2,,Contributing Companies,,,,Contributing Companies,,,,,,,,,,,,"The Ministry of Corporate Affairs (""MCA""), vid..."
3,,S.No.,Company Name(s),Amount (Actuals),,S.No.,Company Name(s),Amount (Actuals),,S.No.,Company Name(s),Amount (Actuals),,,,,,
4,,1,India Infrastructure Finance Company Limited,244600000,,1,Tata Consultancy Services Limited,2802200000,,1,Reliance Industries Limited,6397000000,,,,,,Present Corporate Social Responsibility Norms ...


In [37]:
# "AS Prescribed" table: the left-most of the three tables in Sheet1.
# Row positions 0-3 are blank/title/header rows (see the head() preview), so the
# records start at position 4; column positions 1-3 hold S.No., company name and
# amount. Built as one non-mutating chain so the raw sheet is never copied in
# full or renamed in place.
# NOTE(review): the end bound 401 is hard-coded for this version of the
# workbook -- re-check it if the source file is refreshed.
as_prescribed = (
    CSR_2016_2017_1
    .iloc[4:401, 1:4]
    .rename(columns={
        'Unnamed: 1': 'S.No.',
        'Unnamed: 2': 'Company Name(s)',
        'Unnamed: 3': 'Amount (Actuals)',
    })
    .reset_index(drop=True)
)
as_prescribed

Unnamed: 0,S.No.,Company Name(s),Amount (Actuals)
0,1,India Infrastructure Finance Company Limited,244600000
1,2,Tata Communications Limited,138500000
2,3,Infosys Bpo Limited,135800000
3,4,Gujarat State Fertilizers & Chemicals Limited,122200000
4,5,Havells India Limited,114800000
...,...,...,...
392,393,Aak Kamani Private Limited,0
393,394,Bombay Oxygen Corporation Limited,0
394,395,20 Microns Limited,0
395,396,Hindustan Coca Cola Holdings Private Limited,0


In [30]:
# "Less than Prescribed" table: the middle of the three tables in Sheet1.
# Records start at row position 4 (positions 0-3 are blank/title/header rows);
# column positions 5-7 hold S.No., company name and amount. Built as one
# non-mutating chain so the raw sheet is never copied in full or renamed in place.
# NOTE(review): unlike the other two tables there is no explicit end row here,
# so every remaining row of the sheet is kept -- presumably this is the longest
# table; confirm there are no trailing blank rows.
less_prescribed = (
    CSR_2016_2017_1
    .iloc[4:, 5:8]
    .rename(columns={
        'Unnamed: 5': 'S.No.',
        'Unnamed: 6': 'Company Name(s)',
        'Unnamed: 7': 'Amount (Actuals)',
    })
    .reset_index(drop=True)
)
less_prescribed

Unnamed: 0,S.No.,Company Name(s),Amount (Actuals)
4,1,Tata Consultancy Services Limited,2802200000
5,2,Infosys Limited,2023000000
6,3,Icici Bank Limited,1715100000
7,4,Axis Bank Limited,1477800000
8,5,Housing Development Finance Corporation Limited,1465400000
...,...,...,...
6812,6809,Madaus Pharmaceuticals Private Limited,0
6813,6810,Yahoo India Private Limited,0
6814,6811,Apeejay Shipping Ltd,0
6815,6812,Parker Multi-Commodities (India) Private Limited,0


In [36]:
# "More than Prescribed" table: the right-most of the three tables in Sheet1.
# Records start at row position 4 (positions 0-3 are blank/title/header rows);
# column positions 9-11 hold S.No., company name and amount. Built as one
# non-mutating chain so the raw sheet is never copied in full or renamed in place.
# NOTE(review): the end bound 3732 is hard-coded for this version of the
# workbook -- re-check it if the source file is refreshed.
more_prescribed = (
    CSR_2016_2017_1
    .iloc[4:3732, 9:12]
    .rename(columns={
        'Unnamed: 9': 'S.No.',
        'Unnamed: 10': 'Company Name(s)',
        'Unnamed: 11': 'Amount (Actuals)',
    })
    .reset_index(drop=True)
)
more_prescribed

Unnamed: 0,S.No.,Company Name(s),Amount (Actuals)
0,1,Reliance Industries Limited,6397000000
1,2,Ntpc Limited,4918000000
2,3,Oil And Natural Gas Corporation Limited,4089900000
3,4,Itc Limited,2475000000
4,5,Central Coalfields Limited,2146000000
...,...,...,...
3723,3724,Highend Properties Private Limtied,0
3724,3725,P.C. Chandra Jewellery Apex Private Limited,0
3725,3726,Siro Clinpharm Private Limited,0
3726,3727,Divgi Torqtransfer Systems Private Limited,0
