# Data Collection

Website: https://en.wikipedia.org/wiki/List_of_largest_cities_and_towns_in_Turkey

## Import libraries

In [2]:
try:
    from bs4 import BeautifulSoup
    import pandas
    print ("BeautifulSoup and Pandas are already installed and imported")
except:
    import sys
    !conda install --yes --prefix {sys.prefix} bs4
    !conda install --yes --prefix {sys.prefix} pandas
    from bs4 import BeautifulSoup
    import pandas
    print ("Either BeautifulSoup or Pandas were not found. Installed them and imported")

import requests

BeautifulSoup and Pandas are already installed and imported


## Request web page and parse HTML tree

In [3]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_largest_cities_and_towns_in_Turkey')
print('HTML doc requested successfully')

bs = BeautifulSoup(response.content, 'html.parser')
print('HTML tree parsed successfully')

HTML doc requested successfully
HTML tree parsed successfully


# Data Extraction

In [10]:
# extract the table from the html tree
table = bs.select('table:nth-of-type(2)')
raw_data = table[0].text

print(raw_data)



Rank

City or Town

1990 Census

2000 Census

2007 Estimate

2008 Estimate

2015 Estimate

Province (İl)


1
Istanbul
6,629,431
8,803,468
10,861,463
10,878,360
14,025,646
Istanbul


2
Ankara
2,583,963
3,203,362
3,842,737
3,894,182
4,587,558

Ankara


3
İzmir
1,758,780
2,232,265
2,644,531
2,672,126
2,847,691
İzmir


4
Bursa
834,576
1,194,687
1,537,040
1,589,530
1,854,285
Bursa


5
Adana
916,150
1,130,710
1,506,272
1,517,787
1,563,545
Adana


6
Gaziantep
603,434
853,513
1,192,023
1,235,815
1,495,050
Gaziantep


7
Konya
513,346
742,690
973,791
980,973
1,003,373
Konya


8
Antalya
378,208
603,190
877,945
911,497
955,573
Antalya


9
Kayseri
425,776
536,392
775,594
781,119
911,984
Kayseri


10
Mersin
481,459
633,691
692,300
696,518
842,230
Mersin


11
Eskişehir
413,082
482,793
581,408
599,796
617,215
Eskişehir


12
Diyarbakır
373,810
545,963
613,332
613,821
614,310
Diyarbakır


13
Samsun
322,982
388,509
459,781
461,640
511,601
Samsun


14
Denizli
237,918
357,557
465,947
479,381
492,815
Deni

# Data Transformation

In [11]:
# define column names
COLUMN_NAMES = [
    "Rank",
    "City or Town",
    "1990 Census",
    "2000 Census",
    "2007 Estimate",
    "2008 Estimate",
    "2015 Estimate",
    "Province"
]

In [16]:
# split the raw data into units
entries = raw_data.split('\n')
# remove blank spaces
entries = list(filter(lambda x: x != '', entries))[8:]
print(entries)

['1', 'Istanbul', '6,629,431', '8,803,468', '10,861,463', '10,878,360', '14,025,646', 'Istanbul', '2', 'Ankara', '2,583,963', '3,203,362', '3,842,737', '3,894,182', '4,587,558', 'Ankara', '3', 'İzmir', '1,758,780', '2,232,265', '2,644,531', '2,672,126', '2,847,691', 'İzmir', '4', 'Bursa', '834,576', '1,194,687', '1,537,040', '1,589,530', '1,854,285', 'Bursa', '5', 'Adana', '916,150', '1,130,710', '1,506,272', '1,517,787', '1,563,545', 'Adana', '6', 'Gaziantep', '603,434', '853,513', '1,192,023', '1,235,815', '1,495,050', 'Gaziantep', '7', 'Konya', '513,346', '742,690', '973,791', '980,973', '1,003,373', 'Konya', '8', 'Antalya', '378,208', '603,190', '877,945', '911,497', '955,573', 'Antalya', '9', 'Kayseri', '425,776', '536,392', '775,594', '781,119', '911,984', 'Kayseri', '10', 'Mersin', '481,459', '633,691', '692,300', '696,518', '842,230', 'Mersin', '11', 'Eskişehir', '413,082', '482,793', '581,408', '599,796', '617,215', 'Eskişehir', '12', 'Diyarbakır', '373,810', '545,963', '613,3

In [27]:
# transform data into a list of dictionaries representing rows

final_data = []

counter = 0
row = {}
for unit in entries:
    if counter == 8:
        final_data.append(row)
        row = {}
        counter = 0
    if unit != '-':
        row[COLUMN_NAMES[counter]] = unit
    counter += 1

print(final_data)

[{'Rank': '1', 'City or Town': 'Istanbul', '1990 Census': '6,629,431', '2000 Census': '8,803,468', '2007 Estimate': '10,861,463', '2008 Estimate': '10,878,360', '2015 Estimate': '14,025,646', 'Province': 'Istanbul'}, {'Rank': '2', 'City or Town': 'Ankara', '1990 Census': '2,583,963', '2000 Census': '3,203,362', '2007 Estimate': '3,842,737', '2008 Estimate': '3,894,182', '2015 Estimate': '4,587,558', 'Province': 'Ankara'}, {'Rank': '3', 'City or Town': 'İzmir', '1990 Census': '1,758,780', '2000 Census': '2,232,265', '2007 Estimate': '2,644,531', '2008 Estimate': '2,672,126', '2015 Estimate': '2,847,691', 'Province': 'İzmir'}, {'Rank': '4', 'City or Town': 'Bursa', '1990 Census': '834,576', '2000 Census': '1,194,687', '2007 Estimate': '1,537,040', '2008 Estimate': '1,589,530', '2015 Estimate': '1,854,285', 'Province': 'Bursa'}, {'Rank': '5', 'City or Town': 'Adana', '1990 Census': '916,150', '2000 Census': '1,130,710', '2007 Estimate': '1,506,272', '2008 Estimate': '1,517,787', '2015 Est

## Convert to Dataframe

In [28]:
final_data = pandas.DataFrame(final_data)
final_data.head()

Unnamed: 0,Rank,City or Town,1990 Census,2000 Census,2007 Estimate,2008 Estimate,2015 Estimate,Province
0,1,Istanbul,6629431,8803468,10861463,10878360,14025646,Istanbul
1,2,Ankara,2583963,3203362,3842737,3894182,4587558,Ankara
2,3,İzmir,1758780,2232265,2644531,2672126,2847691,İzmir
3,4,Bursa,834576,1194687,1537040,1589530,1854285,Bursa
4,5,Adana,916150,1130710,1506272,1517787,1563545,Adana


## Clean and Inspect Data

In [29]:
# check for duplicates
num_duplicates = final_data.duplicated().sum()
print(f'There are {num_duplicates} duplicates')

There are 0 duplicates


In [30]:
# inspect data for null entries
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           580 non-null    object
 1   City or Town   580 non-null    object
 2   1990 Census    571 non-null    object
 3   2000 Census    566 non-null    object
 4   2007 Estimate  580 non-null    object
 5   2008 Estimate  580 non-null    object
 6   2015 Estimate  580 non-null    object
 7   Province       580 non-null    object
dtypes: object(8)
memory usage: 36.4+ KB


In [31]:
# drop null
final_data = final_data.dropna()
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 560 entries, 0 to 579
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           560 non-null    object
 1   City or Town   560 non-null    object
 2   1990 Census    560 non-null    object
 3   2000 Census    560 non-null    object
 4   2007 Estimate  560 non-null    object
 5   2008 Estimate  560 non-null    object
 6   2015 Estimate  560 non-null    object
 7   Province       560 non-null    object
dtypes: object(8)
memory usage: 39.4+ KB


In [32]:
# transform numerical columns to the integer data type
def convert_to_int(s):
    s = s.replace(',', '')
    return int(s)

for col in COLUMN_NAMES[2:7]:
    final_data[col] = final_data[col].apply(convert_to_int)

final_data.head()

Unnamed: 0,Rank,City or Town,1990 Census,2000 Census,2007 Estimate,2008 Estimate,2015 Estimate,Province
0,1,Istanbul,6629431,8803468,10861463,10878360,14025646,Istanbul
1,2,Ankara,2583963,3203362,3842737,3894182,4587558,Ankara
2,3,İzmir,1758780,2232265,2644531,2672126,2847691,İzmir
3,4,Bursa,834576,1194687,1537040,1589530,1854285,Bursa
4,5,Adana,916150,1130710,1506272,1517787,1563545,Adana


# Data Loading

In [33]:
final_data.to_csv('Turkish_Cities.csv')