In [None]:
# Import important libraries
import numpy as np
import pandas as pd
import missingno as msno 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Read the registered-companies CSV into the working DataFrame `df`.
df = pd.read_csv(
    '/kaggle/input/all-indian-companies-registration-data-1900-2019/registered_companies.csv',
    sep=',',
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Preview the last rows of the freshly loaded frame.
df.tail()

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
frame_shape = df.shape
print("Number of rows and columns in the data frame: ", frame_shape)


In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# Check the dtype and non-null count of every column.
df.info()

In [None]:
# Show summary statistics as computed by DataFrame.describe().
df.describe()

##### Missing values

In [None]:
# Let's check the missing values in every column with a missingno bar chart.
msno.bar(df)

In [None]:
# Plot the missingno matrix view of the frame to see where values are missing.
msno.matrix(df)

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Drop the two "latest year" columns; per the missing-value checks above,
# almost half of their entries are missing.
df.drop(
    columns=['LATEST_YEAR_ANNUAL_RETURN', 'LATEST_YEAR_FINANCIAL_STATEMENT'],
    inplace=True,
)

In [None]:
# Fill gaps in INDUSTRIAL_CLASS from the previous row, then from the next row
# for any gaps left at the start of the column.
# .ffill()/.bfill() are used because fillna(method=...) is deprecated in
# recent pandas releases; the result is the same.
# NOTE(review): neighbour-based filling assumes adjacent rows are related — confirm.
df.INDUSTRIAL_CLASS = df.INDUSTRIAL_CLASS.ffill().bfill()

In [None]:
# Remove every row that still has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Check the frame's dimensions after dropping rows with missing values.
df.shape

In [None]:
# Renumber the rows from 0 and discard the old index, which has gaps
# after the row drops.
df.reset_index(inplace=True, drop=True)

In [None]:
# Preview the first rows after the missing-value handling.
df.head()

#### Univariate Analysis

Let's check the columns one by one.

##### CORPORATE_IDENTIFICATION_NUMBER

In [None]:
# Count how often each CORPORATE_IDENTIFICATION_NUMBER occurs.
df['CORPORATE_IDENTIFICATION_NUMBER'].value_counts()

It looks like the CORPORATE_IDENTIFICATION_NUMBER values are all valid and unique.

##### COMPANY_NAME

In [None]:
# Inspect the distinct company names.
df['COMPANY_NAME'].unique()

In [None]:
# Some company names carry stray whitespace; remove it from both ends.
# strip() is used rather than rstrip() so that leading whitespace is removed
# too, matching how REGISTERED_OFFICE_ADDRESS is cleaned later in the notebook.
df.COMPANY_NAME = df.COMPANY_NAME.str.strip()

##### COMPANY_STATUS

In [None]:
# Re-inspect the distinct company names after the whitespace cleanup.
df['COMPANY_NAME'].unique()

In [None]:
# Inspect the distinct company status values.
df['COMPANY_STATUS'].unique()

This column already looks good

##### COMPANY_CLASS

In [None]:
# Inspect the distinct company class values.
df['COMPANY_CLASS'].unique()

In [None]:
# COMPANY_CLASS should be either Public or Private, so fold the
# 'Private(One Person Company)' label into 'Private'.
# Series.replace does the exact-match substitution in one vectorised call
# instead of a per-row lambda; all other values are left untouched.
df.COMPANY_CLASS = df.COMPANY_CLASS.replace({'Private(One Person Company)': 'Private'})

In [None]:
# Re-inspect the distinct company class values after the relabelling.
df['COMPANY_CLASS'].unique()

##### COMPANY_CATEGORY

In [None]:
# Inspect the distinct company category values.
df['COMPANY_CATEGORY'].unique()

This column looks good.

##### COMPANY_SUB_CATEGORY

In [None]:
# Inspect the distinct company sub-category values.
df['COMPANY_SUB_CATEGORY'].unique()

This column looks good too.

##### DATE_OF_REGISTRATION

In [None]:
# Convert the registration date column to pandas datetime values.
# NOTE(review): no explicit format is given, so pandas infers it; if the raw
# strings are day-first, ambiguous dates could be misparsed — confirm.
df['DATE_OF_REGISTRATION'] = pd.to_datetime(df['DATE_OF_REGISTRATION'])

In [None]:
# Display the converted registration dates.
df['DATE_OF_REGISTRATION']

Let's create some new features, such as month and year, as separate columns.

In [None]:
# Derive the calendar month and year of registration as new columns.
registration_date = df['DATE_OF_REGISTRATION']
df['MONTH'] = registration_date.dt.month
df['YEAR'] = registration_date.dt.year

##### REGISTERED_STATE

Let's replace the state names with their respective codes.

In [None]:
# Inspect the distinct registered state names.
df['REGISTERED_STATE'].unique()

In [None]:
# Mapping from state / union-territory name to a two-letter code.
# The keys are kept exactly as written because they are matched against the
# values of the REGISTERED_STATE column.
state_codes = dict([
    ('Gujarat', 'GJ'),
    ('Karnataka', 'KA'),
    ('Rajasthan', 'RJ'),
    ('Madhya Pradesh', 'MP'),
    ('Uttaranchal', 'UK'),
    ('Assam', 'AS'),
    ('Jharkhand', 'JH'),
    ('Tamil Nadu', 'TN'),
    ('Delhi', 'DL'),
    ('Maharashtra', 'MH'),
    ('Haryana', 'HR'),
    ('Chattisgarh', 'CG'),
    ('Daman and Diu', 'DD'),
    ('West Bengal', 'WB'),
    ('Lakshadweep', 'LD'),
    ('Himachal Pradesh', 'HP'),
    ('Dadra and Nagra Haveli', 'DH'),
    ('Kerala', 'KL'),
    ('Pondicherry', 'PY'),
    ('Jammu and Kashmir', 'JK'),
    ('Bihar', 'BR'),
    ('Andhra Pradesh', 'AP'),
    ('Arunachal Pradesh', 'AR'),
    ('Uttar Pradesh', 'UP'),
    ('Telangana', 'TS'),
    ('Chandigarh', 'CH'),
    ('Punjab', 'PB'),
    ('Orissa', 'OR'),
    ('Goa', 'GA'),
    ('Meghalaya', 'ML'),
    ('Tripura', 'TR'),
    ('Sikkim', 'SK'),
    ('Mizoram', 'MZ'),
    ('Manipur', 'MN'),
    ('Andaman and Nicobar Islands', 'AN'),
    ('Nagaland', 'NL'),
])

In [None]:
# Swap each state name in REGISTERED_STATE for its code from state_codes;
# values without an entry in the mapping are left unchanged.
df['REGISTERED_STATE'] = df['REGISTERED_STATE'].replace(state_codes)

##### REGISTERED_OFFICE_ADDRESS

In [None]:
# The registered office addresses carry stray whitespace; trim both ends.
df['REGISTERED_OFFICE_ADDRESS'] = df['REGISTERED_OFFICE_ADDRESS'].str.strip()

In [None]:
# Inspect the distinct registered office addresses after trimming.
df['REGISTERED_OFFICE_ADDRESS'].unique()

##### REGISTRAR_OF_COMPANIES

In [None]:
# It looks like there are non-breaking spaces in REGISTRAR_OF_COMPANIES

In [None]:
# Inspect the distinct registrar values.
df['REGISTRAR_OF_COMPANIES'].unique()

In [None]:
# Replace non-breaking spaces (\xa0) in the registrar names with ordinary
# spaces; the pattern is treated as a literal string, not a regex.
registrar = df['REGISTRAR_OF_COMPANIES']
df['REGISTRAR_OF_COMPANIES'] = registrar.str.replace('\xa0', ' ', regex=False)

In [None]:
# Re-inspect the distinct registrar values after the space replacement.
df['REGISTRAR_OF_COMPANIES'].unique()

In [None]:
# Final check of column dtypes and non-null counts after the cleaning steps.
df.info()