In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
domestic16 = pd.read_csv('domestic_visitors_2016.csv')
domestic17 = pd.read_csv('domestic_visitors_2017.csv')
domestic18 = pd.read_csv('domestic_visitors_2018.csv')
domestic19 = pd.read_csv('domestic_visitors_2019.csv')

## Let's check the data quality

In [6]:
domestic16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   district  372 non-null    object
 1   date      372 non-null    object
 2   month     372 non-null    object
 3   year      372 non-null    int64 
 4   visitors  372 non-null    object
dtypes: int64(1), object(4)
memory usage: 14.7+ KB


### If we can look at the data randomly we'll see some missing values in visitors columns butv the previous cell says there's no missing values, that means those are empty strings

In [7]:
domestic16.sample(20)

Unnamed: 0,district,date,month,year,visitors
273,Ranga Reddy,01-10-2016,October,2016,
218,Nirmal,01-03-2016,March,2016,
113,Khammam,01-06-2016,June,2016,445862.0
23,Bhadradri Kothagudem,01-12-2016,December,2016,326770.0
103,Karimnagar,01-08-2016,August,2016,1018399.0
75,Jogulamba Gadwal,01-04-2016,April,2016,
329,Wanaparthy,01-06-2016,June,2016,
269,Ranga Reddy,01-06-2016,June,2016,
344,Warangal (Rural),01-09-2016,September,2016,
357,Warangal (Urban),01-10-2016,October,2016,128247.0


### Let's look at the next year domestic data
### So as we can see it's the same issue

In [10]:
domestic17.sample(10)

Unnamed: 0,district,date,month,year,visitors
97,Karimnagar,01-02-2017,February,2017,10845.0
45,Jagtial,01-10-2017,October,2017,248504.0
273,Ranga Reddy,01-10-2017,October,2017,
118,Khammam,01-11-2017,November,2017,110800.0
32,Hyderabad,01-09-2017,September,2017,2011280.0
139,Mahabubabad,01-08-2017,August,2017,12520.0
311,Suryapet,01-12-2017,December,2017,0.0
251,Peddapalli,01-12-2017,December,2017,1765.0
99,Karimnagar,01-04-2017,April,2017,8630.0
27,Hyderabad,01-04-2017,April,2017,2366793.0


### We'll just merg all the domestic data together

In [14]:
pd.concat([domestic16,domestic17,domestic18,domestic19],ignore_index=True)

Unnamed: 0,district,date,month,year,visitors
0,Adilabad,01-01-2016,January,2016,792136
1,Adilabad,01-02-2016,February,2016,937820
2,Adilabad,01-03-2016,March,2016,582946
3,Adilabad,01-04-2016,April,2016,341948
4,Adilabad,01-05-2016,May,2016,252887
...,...,...,...,...,...
1507,Yadadri Bhongir,01-08-2019,August,2019,389010
1508,Yadadri Bhongir,01-09-2019,September,2019,366862
1509,Yadadri Bhongir,01-10-2019,October,2019,381860
1510,Yadadri Bhongir,01-11-2019,November,2019,365990


## Save in a single dataframe

In [15]:
domestic = pd.concat([domestic16,domestic17,domestic18,domestic19],ignore_index=True)

## Now we'll do the same thing for the foreign visitors dataframes

In [16]:
foreign16 = pd.read_csv('foreign_visitors_2016.csv')
foreign17 = pd.read_csv('foreign_visitors_2017.csv')
foreign18 = pd.read_csv('foreign_visitors_2018.csv')
foreign19 = pd.read_csv('foreign_visitors_2019.csv')

## As we can see there's some missing values prresent here as well

In [20]:
foreign16.sample(20)

Unnamed: 0,district,date,month,year,visitors
152,Mahbubnagar,01-09-2016,September,2016,74.0
263,Rajanna Sircilla,01-12-2016,December,2016,0.0
324,Wanaparthy,01-01-2016,January,2016,
225,Nirmal,01-10-2016,October,2016,0.0
73,Jogulamba Gadwal,01-02-2016,February,2016,
317,Vikarabad,01-06-2016,June,2016,
49,Jangaon,01-02-2016,February,2016,
125,Komaram Bheem Asifabad,01-06-2016,June,2016,
307,Suryapet,01-08-2016,August,2016,
120,Komaram Bheem Asifabad,01-01-2016,January,2016,


## Let's join them together

In [21]:
foreign = pd.concat([foreign16,foreign17,foreign18,foreign19],ignore_index=True)

In [29]:
domestic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   district  1512 non-null   object
 1   date      1512 non-null   object
 2   month     1512 non-null   object
 3   year      1512 non-null   int64 
 4   visitors  1482 non-null   object
dtypes: int64(1), object(4)
memory usage: 59.2+ KB


In [None]:
domestic

In [31]:
domestic['visitors'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
1507    False
1508    False
1509    False
1510    False
1511    False
Name: visitors, Length: 1512, dtype: bool

In [24]:
domestic.groupby('district')['visitors'].sum()

district
Adilabad                    7921369378205829463419482528873682374475626142...
Bhadradri Kothagudem                 3101332521273267703023152513592439436...
Hyderabad                   1122510778748101779411277381287181120326611096...
Jagtial                              2012492145342072941744771596593236583...
Jangaon                              1477013210126801228012610845097881213...
Jayashankar Bhoopalpally             7636081580854609656096850814009070091...
Jogulamba Gadwal                     1528981881701823332148874811101550131...
Kamareddy                            4738425850414660414140413843414444454...
Karimnagar                  1984442779960101839995695910956429147464010471...
Khammam                     6518274075334585279716224339944458624149635036...
Komaram Bheem Asifabad               0002241249027673074341637963713366539...
Mahabubabad                          1303211401512955132601304591009140102...
Mahbubnagar                 577071131206817319836224325