## Importing CSV files

In [1]:
# Plain-Python approach: read the .csv as raw text, one string per line.
with open('titanic.csv') as csv_file:
    text = csv_file.readlines()

`text` is too long to display here: printing it would take up roughly 1,000 lines of space on GitHub, so we only look at its length.

In [29]:
len(text)   # number of lines read from the file (one list entry per line)

892

The pandas method:

In [5]:
import pandas as pd

In [6]:
pd.read_csv('titanic.csv')   # read the whole file into a DataFrame using the default settings

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


index_col parameter

In [7]:
pd.read_csv('titanic.csv', index_col = 'pclass')   # use the 'pclass' column as the row index

Unnamed: 0_level_0,survived,sex,age,sibsp,parch,fare,embarked,deck
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,0,male,22.0,1,0,7.2500,S,
1,1,female,38.0,1,0,71.2833,C,C
3,1,female,26.0,0,0,7.9250,S,
1,1,female,35.0,1,0,53.1000,S,C
3,0,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...
2,0,male,27.0,0,0,13.0000,S,
1,1,female,19.0,0,0,30.0000,S,B
3,0,female,,1,2,23.4500,S,
1,1,male,26.0,0,0,30.0000,C,C


header parameter

In [16]:
pd.read_csv('titanic.csv', header = 0)  # defines which row serves as the header of the DataFrame (0 = first row); it can be None as well

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


names parameter to change the column labels

In [17]:
pd.read_csv(
    'titanic.csv',
    header=0,   # the first row holds the original labels, which `names` replaces
    names=['alive', 'class', 'gender', 'age', 'sibsp', 'parch', 'price', 'emb', 'deck'],
)

Unnamed: 0,alive,class,gender,age,sibsp,parch,price,emb,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


usecols parameter: import only the listed columns

In [23]:
# Import only four of the columns and use 'pclass' as the row index.
titanic = pd.read_csv(
    'titanic.csv',
    header=0,
    index_col='pclass',
    usecols=['survived', 'pclass', 'sex', 'age'],
)

In [24]:
titanic.head()   # preview the first five rows

Unnamed: 0_level_0,survived,sex,age
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,0,male,22.0
1,1,female,38.0
3,1,female,26.0
1,1,female,35.0
3,0,male,35.0


changing column labels

In [25]:
titanic.columns = ['alive', 'gender', 'age']   # relabel the three data columns in place (the index is not part of .columns)

In [27]:
titanic.index.name = 'class'   # the index label is renamed separately from the column labels

In [28]:
titanic.head()   # confirm the new column and index labels

Unnamed: 0_level_0,alive,gender,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,0,male,22.0
1,1,female,38.0
3,1,female,26.0
1,1,female,35.0
3,0,male,35.0


## Importing messy CSV files

In [1]:
import pandas as pd

In [11]:
pd.read_csv('titanic_raw.csv')  # messy data: the free-text lines around the table end up as header and rows

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,This,is,the,Titanic,Dataset.
It,contains,all,passengers,that,were,aboard,the,titanic,when
the,tragic,disaster,occured,in,the,year,19,1,2
0,0,3,male,22.0,1,0,$7.25,S,
1,1,1,female,38.0,1,0,$71.2833,C,C
2,1,3,female,26.0,0,0,$7.925,S,
...,...,...,...,...,...,...,...,...,...
891,0,2,male,24.0,0,0,$10.5,S,
892,0,3,male,34.0,1,1,$14.4,S,
893,0,3,male,36.0,0,0,$7.8958,S,
End,of,the,Dataset,,,,,,


In [5]:
# Column labels to assign when importing the raw file.
col_names = [
    'Survived', 'Class', 'Gender',
    'Age', 'Sibsp', 'Parch',
    'Fare', 'Emb', 'Deck',
]

In [12]:
# Skip the 3 free-text lines at the top and the last 2 lines of the file, then
# apply our own labels (header = None: none of the remaining lines is a header).
# skipfooter is only supported by the 'python' parser engine; selecting it
# explicitly avoids the ParserWarning pandas emits when it has to fall back
# from the default 'c' engine.
titanic = pd.read_csv(
    'titanic_raw.csv',
    skiprows=3,
    skipfooter=2,
    header=None,
    names=col_names,
    engine='python',
)

  return func(*args, **kwargs)


In [13]:
titanic.head()   # check that the table now starts at the first passenger row

Unnamed: 0,Survived,Class,Gender,Age,Sibsp,Parch,Fare,Emb,Deck
0,0,3,male,22.0,1,0,$7.25,S,
1,1,1,female,38.0,1,0,$71.2833,C,C
2,1,3,female,26.0,0,0,$7.925,S,
3,1,1,female,35.0,1,0,$53.1,S,C
4,0,3,male,35.0,0,0,$8.05,S,


In [18]:
titanic.to_csv('titanic_clean.csv', index = False)  # save the cleaned table to disk; index = False leaves the row index out of the file

In [19]:
pd.read_csv('titanic_clean.csv')  # if the file had been saved with index = True, the old index would show up here as an additional column

Unnamed: 0,Survived,Class,Gender,Age,Sibsp,Parch,Fare,Emb,Deck
0,0,3,male,22.0,1,0,$7.25,S,
1,1,1,female,38.0,1,0,$71.2833,C,C
2,1,3,female,26.0,0,0,$7.925,S,
3,1,1,female,35.0,1,0,$53.1,S,C
4,0,3,male,35.0,0,0,$8.05,S,
...,...,...,...,...,...,...,...,...,...
889,1,1,male,26.0,0,0,$30.0,C,C
890,0,3,male,32.0,0,0,$7.75,Q,
891,0,2,male,24.0,0,0,$10.5,S,
892,0,3,male,34.0,1,1,$14.4,S,


## Importing from Excel Files with pd.read_excel( )

### First steps

In [20]:
import pandas as pd

In [27]:
sales = pd.read_excel('sales.xls')   # import the Excel file as a DataFrame

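By default, `read_excel` does not use any column of the sheet as the index: the DataFrame gets a new RangeIndex, and the first Excel column (the salespeople's names) is imported as an ordinary column labelled `Unnamed: 0`.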

In [28]:
sales   # display the imported DataFrame

Unnamed: 0.1,Unnamed: 0,City,Country,Sales,Bonus
0,Mike,New York,USA,25,2.5
1,Jim,Boston,USA,43,4.3
2,Steven,London,UK,76,7.6
3,Joe,Madrid,Spain,12,1.8
4,Tom,Paris,France,89,13.4


let's change that!

In [30]:
sales = pd.read_excel('sales.xls', index_col = 0 )  # use the first column (the names) as the index

In [31]:
sales   # the names are now the row labels

Unnamed: 0,City,Country,Sales,Bonus
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.4


In [32]:
sales.info()   # summary of the index, column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Mike to Tom
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   City     5 non-null      object 
 1   Country  5 non-null      object 
 2   Sales    5 non-null      int64  
 3   Bonus    5 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


In [33]:
pd.read_excel('sales.xls', index_col = 0, header = 0)        # header = 0: the first row holds the column labels; header can also be None

Unnamed: 0,City,Country,Sales,Bonus
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.4


changing column labels

In [35]:
pd.read_excel(
    'sales.xls',
    index_col=0,
    header=0,
    # one label per imported column, including the one that becomes the index
    names=['name', 'Loc_city', 'loc_country', 'revenue', 'Add_comp'],
)

Unnamed: 0_level_0,Loc_city,loc_country,revenue,Add_comp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.4


specify which columns to use

In [36]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = "A:C")    # Excel columns are labelled alphabetically; both ends of the range are included

Unnamed: 0,City,Country
Mike,New York,USA
Jim,Boston,USA
Steven,London,UK
Joe,Madrid,Spain
Tom,Paris,France


In [37]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = 'A, C:E')   # single columns and ranges can be combined, separated by commas

Unnamed: 0,Country,Sales,Bonus
Mike,USA,25,2.5
Jim,USA,43,4.3
Steven,UK,76,7.6
Joe,Spain,12,1.8
Tom,France,89,13.4


In [39]:
# Demonstration of a failure: an open-ended range such as 'C:' doesn't work.
# The error is caught and printed so the notebook still runs from top to bottom.
try:
    pd.read_excel('sales.xls', index_col=0, header=0, usecols='C:')
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: pop from empty list

In [38]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = [0,3,4])   # 0-based integer positions work as well (here: Excel columns A, D and E)

Unnamed: 0,Sales,Bonus
Mike,25,2.5
Jim,43,4.3
Steven,76,7.6
Joe,12,1.8
Tom,89,13.4


In [44]:
pd.read_excel('sales.xls', index_col = 0, header = 0, usecols = ['City', 'Sales'])   # not recommended though, prefer Excel letters: index_col = 0 now refers to the first selected column, so 'City' becomes the index

Unnamed: 0_level_0,Sales
City,Unnamed: 1_level_1
New York,25
Boston,43
London,76
Madrid,12
Paris,89
