# Clean your Data With NumPy and Pandas

In [1]:
#Import Library
import numpy as np
import pandas as pd
import re

In [2]:
# Load the raw text file from the datasets folder; the file has no header row
data = pd.read_fwf(
    '../datasets/clean-data/university_towns.txt',
    sep=" ",
    header=None,
)

<b>'fwf' stands for fixed-width formatted lines.</b><br />
<b>sep=" " specifies a single space as the separator.</b><br />
<b>header=None is used because the file has no header row.</b>

In [3]:
# Preview the DataFrame that was read from the file
data.head(10) # Show the first ten entries along with the column names

Unnamed: 0,0
0,Alabama[edit]
1,Auburn (Auburn University)[1]
2,Florence (University of North Alabama)
3,Jacksonville (Jacksonville State University)[2]
4,Livingston (University of West Alabama)[2]
5,Montevallo (University of Montevallo)[2]
6,Troy (Troy University)[2]
7,"Tuscaloosa (University of Alabama, Stillman Co..."
8,Tuskegee (Tuskegee University)[5]
9,Alaska[edit]


In [4]:
# Another way to inspect the raw file (shell command, left commented out)
#!more '../datasets/clean-data/university_towns.txt'

In [5]:
# Pair every town line with the most recent state line.
# State lines are recognised by the '[edit]' marker they contain.
town = []
with open('../datasets/clean-data/university_towns.txt') as fh:
    for line in fh:
        if '[edit]' in line:
            # A new state header: remember it for the town lines that follow
            state = line
            continue
        town.append((state, line))

In [6]:
# Peek at the first five (state, town) tuples
town[0:5]

[('Alabama[edit]\n', 'Auburn (Auburn University)[1]\n'),
 ('Alabama[edit]\n', 'Florence (University of North Alabama)\n'),
 ('Alabama[edit]\n', 'Jacksonville (Jacksonville State University)[2]\n'),
 ('Alabama[edit]\n', 'Livingston (University of West Alabama)[2]\n'),
 ('Alabama[edit]\n', 'Montevallo (University of Montevallo)[2]\n')]

In [7]:
# Build a two-column DataFrame from the (state, town) tuples
town_df = pd.DataFrame(data=town, columns=["State", "Town"])
town_df.head(5)

Unnamed: 0,State,Town
0,Alabama[edit]\n,Auburn (Auburn University)[1]\n
1,Alabama[edit]\n,Florence (University of North Alabama)\n
2,Alabama[edit]\n,Jacksonville (Jacksonville State University)[2]\n
3,Alabama[edit]\n,Livingston (University of West Alabama)[2]\n
4,Alabama[edit]\n,Montevallo (University of Montevallo)[2]\n


In [8]:
# Now Clean Data Columns
def get_clean(item):
    """Return the leading name of a raw state/town string.

    The text is cut at whichever comes first: the ' (' that opens the
    university list or the '[' that opens an '[edit]' / citation marker.
    Surrounding whitespace (including the trailing newline read from the
    file) is removed.

    Parameters
    ----------
    item : str
        Raw cell value such as 'Auburn (Auburn University)[1]\\n' or
        'Alabama[edit]\\n'.

    Returns
    -------
    str
        The cleaned name, e.g. 'Auburn' or 'Alabama'. Non-string values
        (for example missing values) are returned unchanged.
    """
    if not isinstance(item, str):
        # Nothing to clean on non-text cells; pass them through untouched
        return item
    # Positions of every marker that is actually present in the string
    cut_points = [pos for pos in (item.find(' ('), item.find('[')) if pos != -1]
    if cut_points:
        item = item[:min(cut_points)]
    return item.strip()
# Apply get_clean to every cell and display the result (town_df itself is not modified).
# Series.map per column is used instead of DataFrame.applymap, which is deprecated in recent pandas.
town_df.apply(lambda column: column.map(get_clean))


Unnamed: 0,State,Town
0,Alabama,Auburn
1,Alabama,Florence
2,Alabama,Jacksonville
3,Alabama,Livingston
4,Alabama,Montevallo
5,Alabama,Troy
6,Alabama,Tuscaloosa
7,Alabama,Tuskegee
8,Alaska,Fairbanks
9,Arizona,Flagstaff


<b>What about the university names? How do we get them?</b>

In [9]:
# Start again from the raw (state, town) tuples in a separate DataFrame
city_df = pd.DataFrame(data=town, columns=["State", "Town"])
city_df.head(5)

Unnamed: 0,State,Town
0,Alabama[edit]\n,Auburn (Auburn University)[1]\n
1,Alabama[edit]\n,Florence (University of North Alabama)\n
2,Alabama[edit]\n,Jacksonville (Jacksonville State University)[2]\n
3,Alabama[edit]\n,Livingston (University of West Alabama)[2]\n
4,Alabama[edit]\n,Montevallo (University of Montevallo)[2]\n


In [10]:
# re.findall returns a list with the text of every "(...)" group in the town string.
# astype(str) turns each list into its string representation, e.g. "['Auburn University']".
# The pattern is a raw string so that "\(" and "\)" are not treated as invalid string escapes.
new = pd.DataFrame(
    city_df["Town"].apply(lambda x: re.findall(r'\((.*?)\)', x))
).astype(str)
new.head()


Unnamed: 0,Town
0,['Auburn University']
1,['University of North Alabama']
2,['Jacksonville State University']
3,['University of West Alabama']
4,['University of Montevallo']


<b>Link for Pandas Data Types:<br />
https://pbpython.com/pandas_dtypes.html </b>

In [11]:
# Attach the extracted university strings (the single "Town" column of `new`) as a new column
city_df["University"] = new["Town"]

In [12]:
# Keep only the name that precedes the first "(" or "[" marker and trim whitespace.
# .str[0] selects the first piece of each split as a Series; assigning the
# multi-column frame produced by expand=True to a single column raises
# "Columns must be same length as key" on current pandas.
city_df["Town"] = (
    city_df["Town"]
    .str.split("(", n=1).str[0]
    .str.split("[", n=1).str[0]
    .str.strip()
)
city_df["State"] = city_df["State"].str.split("[", n=1).str[0].str.strip()

In [13]:
# Inspect the first five rows after the Town and State columns were shortened
city_df.head(5)

Unnamed: 0,State,Town,University
0,Alabama,Auburn,['Auburn University']
1,Alabama,Florence,['University of North Alabama']
2,Alabama,Jacksonville,['Jacksonville State University']
3,Alabama,Livingston,['University of West Alabama']
4,Alabama,Montevallo,['University of Montevallo']


In [14]:
# dtype of the University column
city_df.dtypes['University']

dtype('O')

In [15]:
# dtype of the State column
city_df.dtypes['State']

dtype('O')

In [16]:
# dtype of the Town column
city_df.dtypes['Town']

dtype('O')

In [17]:
# Remove the list-repr wrappers "['" and "']" left over from astype(str).
# regex=False makes the match literal, so the result does not depend on the
# pandas version's default for the `regex` argument.
city_df["University"] = (
    city_df["University"]
    .str.replace("['", "", regex=False)
    .str.replace("']", "", regex=False)
)

# Data exploration and Basic Hygiene

In [18]:
# Count the missing values in each column
print(city_df.isna().sum())

State         0
Town          0
University    0
dtype: int64


In [19]:
# Final look at the first five cleaned rows
city_df.head(5)

Unnamed: 0,State,Town,University
0,Alabama,Auburn,Auburn University
1,Alabama,Florence,University of North Alabama
2,Alabama,Jacksonville,Jacksonville State University
3,Alabama,Livingston,University of West Alabama
4,Alabama,Montevallo,University of Montevallo
