In [87]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 

%matplotlib inline

In [88]:
# Load the air/water pollution dataset from a local CSV file into `data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists — consider a configurable data dir.
csv_path = r'/Users/smyy.karaalioglu/Desktop/kuggle/air_pollution.csv'
data = pd.read_csv(csv_path)

In [89]:
# Show the raw header names exactly as parsed from the CSV; the recorded output
# shows a leading space and literal double quotes on every name except 'City'.
data.columns

Index(['City', ' "Region"', ' "Country"', ' "AirQuality"',
       ' "WaterPollution"'],
      dtype='object')

In [90]:
# Column scales:
#   AirQuality     -> 0 (bad quality) up to 100 (top good quality)
#   WaterPollution -> 0 (no pollution) up to 100 (extreme pollution)

# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,City,"""Region""","""Country""","""AirQuality""","""WaterPollution"""
0,New York City,"""New York""","""United States of America""",46.816038,49.50495
1,"Washington, D.C.","""District of Columbia""","""United States of America""",66.129032,49.107143
2,San Francisco,"""California""","""United States of America""",60.514019,43.0
3,Berlin,"""""","""Germany""",62.36413,28.612717
4,Los Angeles,"""California""","""United States of America""",36.621622,61.299435


In [91]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

City                  object
 "Region"             object
 "Country"            object
 "AirQuality"        float64
 "WaterPollution"    float64
dtype: object

In [92]:
# Count missing values per column.
data.isna().sum()

City                 0
 "Region"            0
 "Country"           0
 "AirQuality"        0
 "WaterPollution"    0
dtype: int64

In [93]:
# Number of distinct values in each column.
data.nunique()

City                 3796
 "Region"            1153
 "Country"            177
 "AirQuality"         667
 "WaterPollution"     561
dtype: int64

In [94]:
# Trim leading/trailing whitespace from every column header
# (the raw headers start with a space, e.g. ' "Region"').
stripped_names = data.columns.str.strip()
data.columns = stripped_names

In [95]:
# Drop the literal double-quote characters embedded in the column headers.
unquoted_names = data.columns.str.replace('"', '')
data.columns = unquoted_names

In [96]:
# Map any still-quoted header to its plain name, in a single rename call.
# NOTE(review): the preceding cell already removes every '"' from the headers,
# so these keys presumably no longer exist and this rename changes nothing
# (rename ignores keys that are not present by default).
quoted_to_plain = {
    '"Country"': 'Country',
    '"Region"': 'Region',
    '"AirQuality"': 'AirQuality',
    '"WaterPollution"': 'WaterPollution',
}
data.rename(columns=quoted_to_plain, inplace=True)

In [97]:
# Spot-check the first row after the header clean-up; the recorded output shows
# clean headers while the Region/Country values still contain quote characters.
data.head(1)

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,"""New York""","""United States of America""",46.816038,49.50495


In [98]:
#There are still unexpected characters in column names

import re

def clean_column_name(col_name):
    r"""Return ``col_name`` with every non-word character removed.

    Only the characters matched by the regex class ``\w`` are kept, in their
    original order; everything else (spaces, quotes, punctuation) is dropped.

    Parameters
    ----------
    col_name : str
        The raw column header.

    Returns
    -------
    str
        The header reduced to its word characters.
    """
    # Collect each run of word characters and glue the runs back together.
    word_runs = re.findall(r'\w+', col_name)
    return ''.join(word_runs)


# Run every header through clean_column_name and install the cleaned names.
data.columns = list(map(clean_column_name, data.columns))

In [99]:
# Select the rows that data.duplicated() flags and count the non-null values
# in each column; all zeros means no duplicated rows were found.
is_duplicate_row = data.duplicated()
data.loc[is_duplicate_row].count()

City              0
Region            0
Country           0
AirQuality        0
WaterPollution    0
dtype: int64

In [101]:
# Clean the text values of the Country and Region columns.
#
# Step 1: remove the literal double-quote characters embedded in the values
#         (visible in the earlier data.head() output). regex=False makes the
#         quote a plain literal match regardless of the pandas version.
# Step 2: trim surrounding whitespace.
# NOTE(review): the raw headers carried a leading space before the quote, so
# the values presumably do too — removing only the quotes would leave entries
# such as ' New York'. Stripping is a no-op if no such whitespace exists.
for text_column in ('Country', 'Region'):
    data[text_column] = (
        data[text_column]
        .str.replace('"', '', regex=False)
        .str.strip()
    )

# Preview the first two rows to confirm the values are clean.
data.head(2)

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.816038,49.50495
1,"Washington, D.C.",District of Columbia,United States of America,66.129032,49.107143


In [102]:
# Keep AirQuality to two decimal places, then preview the first two rows.
air_quality_2dp = np.round(data['AirQuality'], 2)
data['AirQuality'] = air_quality_2dp
data.head(2)

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.82,49.50495
1,"Washington, D.C.",District of Columbia,United States of America,66.13,49.107143


In [103]:
# Keep WaterPollution to two decimal places, then preview the first two rows.
data['WaterPollution'] = data['WaterPollution'].round(decimals=2)
data.iloc[:2]


Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.82,49.5
1,"Washington, D.C.",District of Columbia,United States of America,66.13,49.11


In [104]:
# Scales: AirQuality runs from 0 (bad quality) to 100 (top good quality);
#         WaterPollution runs from 0 (no pollution) to 100 (extreme pollution).

# Question: which cities have the highest air quality and the lowest water
# pollution? (Author's recorded answer: 248 cities.)
# Both columns were rounded to two decimals earlier, so the comparisons act on
# the rounded values.
no_water_pollution = data['WaterPollution'].eq(0)
top_air_quality = data['AirQuality'].eq(100)
data.loc[no_water_pollution & top_air_quality].sort_values(by='Country')


Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
2785,Farah,Farah,Afghanistan,100.0,0.0
2862,Permet,Gjirokaster County,Albania,100.0,0.0
3614,Khenchela,Khenchela Province,Algeria,100.0,0.0
3315,Tebessa,Tebessa Province,Algeria,100.0,0.0
3716,San Isidro,Buenos Aires Province,Argentina,100.0,0.0
...,...,...,...,...,...
3295,San Marcos,Texas,United States of America,100.0,0.0
3294,Leander,Texas,United States of America,100.0,0.0
3534,Woodland,Washington,United States of America,100.0,0.0
3307,Oxford,Mississippi,United States of America,100.0,0.0


In [116]:
# Question: which cities have the lowest air quality and the highest water
# pollution level? (Author's recorded answer: 70 cities.)
worst_cities = data.query('WaterPollution == 100 and AirQuality == 0')
worst_cities.sort_values(by='Country')

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
2144,Lushnje,Fier County,Albania,0.0,100.0
3734,Skikda,Skikda Province,Algeria,0.0,100.0
3226,Avellaneda,Santa Fe Province,Argentina,0.0,100.0
2654,Bendigo,Victoria,Australia,0.0,100.0
3931,Beringen,Flemish Region,Belgium,0.0,100.0
...,...,...,...,...,...
3179,Bloomfield Hills,Michigan,United States of America,0.0,100.0
3704,Stone Mountain,Georgia,United States of America,0.0,100.0
2765,Cabudare,Lara,Venezuela,0.0,100.0
2528,Ta'izz,Ta'izz Governorate,Yemen,0.0,100.0
