# Open a .tsv file

We will use a synthetic dataset of individuals who took either an intelligence quotient (IQ) test or an emotional quotient (EQ) test.

The data consists of three variables: the rank of a test-taker, their name, and their test type (IQ or EQ).

In [1]:
import pandas as pd

## Using `open`

In [2]:
# Read the file line by line: drop the trailing newline from each line,
# then split on tabs so every row becomes a list of its field strings.
with open("tiny_tsv.tsv") as tsv_file:
    tiny_tsv_list = [row.rstrip('\n').split('\t') for row in tsv_file]

tiny_tsv_list

[['rank', 'name', 'test_type'],
 ['1', 'Adam', 'IQ'],
 ['2', 'Maria', 'EQ'],
 ['3', 'Jenny', 'IQ'],
 ['4', 'Amira', 'IQ'],
 ['5', 'Ola', 'EQ']]

## Using `pandas`

In [3]:
# pandas is already imported in the first code cell, so it is not re-imported here.
# read_csv handles tab-separated files once the separator is set to a tab.
df_tsv = pd.read_csv('tiny_tsv.tsv', sep='\t')
df_tsv

Unnamed: 0,rank,name,test_type
0,1,Adam,IQ
1,2,Maria,EQ
2,3,Jenny,IQ
3,4,Amira,IQ
4,5,Ola,EQ


# Open a .csv file from a zipped archive

Now let's open the same data stored in a zipped .csv file.

## OPTION 1 using `pandas`

In [4]:
# OPTION 1: hand the zip archive straight to pandas.
# compression='infer' is the read_csv default and is spelled out here for clarity.
# No separator is given, so the default comma is used and the
# semicolon-delimited fields end up together in a single column.
direct_df = pd.read_csv('tiny_csv.zip', compression='infer')
direct_df

Unnamed: 0,rank;name;test_type
0,1;Adam;IQ
1,2;Maria;EQ
2,3;Jenny;IQ
3,4;Amira;IQ
4,5;Ola;EQ
5,6;Jerry;No Test Taken
6,7;Kai;EQ


In [5]:
# OPTION 1 limitation: pandas can only read a ZIP archive that holds exactly
# one file. The expected ValueError is caught and printed so the failure is
# still demonstrated without stopping a "Restart & Run All".
try:
    pd.read_csv('multi_tiny.zip')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Multiple files found in ZIP file. Only one file per ZIP: ['tiny_csv.csv', 'tiny_tsv.tsv']

## OPTION 2 using `zipfile`

In [6]:
# OPTION 2: unpack the archive to disk first, then read the extracted file.
import zipfile

# Extract every member of the archive into the "tiny_csv/" folder.
with zipfile.ZipFile("tiny_csv.zip", mode="r") as archive:
    archive.extractall(path="tiny_csv/")

# No separator is given yet, so pandas falls back to its default comma
# delimiter and each semicolon-delimited row lands in a single column.
indirect_df = pd.read_csv('tiny_csv/tiny_csv.csv')
indirect_df

Unnamed: 0,rank;name;test_type
0,1;Adam;IQ
1,2;Maria;EQ
2,3;Jenny;IQ
3,4;Amira;IQ
4,5;Ola;EQ
5,6;Jerry;No Test Taken
6,7;Kai;EQ


In [7]:
# Proceed with Option 2: re-read the extracted file, this time configuring
# the parser in a single read_csv call (numbered steps annotated inline).
indirect_df = pd.read_csv(
    'tiny_csv/tiny_csv.csv',
    sep=';',                    # 1. fields are separated by semicolons
    header=0,                   # 2. the first (0th) row holds the column names
    dtype={                     # 3. explicit data type for each column
        'rank': 'int',
        'name': 'string',
        'test_type': 'string',
    },
    index_col='rank',           # 4. use the rank column as the index
    na_values='No Test Taken',  # 5. mark 'No Test Taken' entries as missing values
)
indirect_df

Unnamed: 0_level_0,name,test_type
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adam,IQ
2,Maria,EQ
3,Jenny,IQ
4,Amira,IQ
5,Ola,EQ
6,Jerry,
7,Kai,EQ
