# (1) Loading CSV files

In [4]:
import pandas as pd
#import numpy as np

## Basic file inspection

In [None]:
# show the current working directory; the relative file names used below are resolved against it
!pwd 

In [None]:
# list the files in the current working directory
!ls

In [None]:
# count the number of lines in the file (`wc -l` prints the line count followed by the file name)
!wc -l "data1.csv" 

In [None]:
# print file to screen
!cat "data1.csv" 

In [None]:
# print the first 3 lines
!head -3 "data1.csv" 

## Loading CSV Files to DataFrames

In [None]:
# Load the CSV file into a DataFrame; by default the first line supplies the column names.
df = pd.read_csv("data1.csv")
# Ending the cell with the bare name uses the notebook's rich table display instead of print().
df

### Handling missing header

In [None]:
# reference file for comparison: its first line holds the column names
!cat "data1.csv"

In [None]:
# NOTE(review): presumably the same data without a header line (per the section title) — confirm from the output
!cat "data2.csv"

In [None]:
# default behaviour: the first line of the file is used as the column names
pd.read_csv("data2.csv")

In [None]:
# header=None: no line is treated as a header; the columns get integer labels instead
pd.read_csv("data2.csv", header=None)

In [None]:
# header=None plus names=...: keep every line as data and label the columns explicitly
column_names = ['Language', 'Wiki', 'Articles', 'Pages', 'Edits']
pd.read_csv("data2.csv", names=column_names, header=None)

### Skip lines in CSV file

In [None]:
# A plain read of data3.csv is expected to fail.
# NOTE(review): presumably because of extra lines before the header (see the skiprows cells below).
# The error is caught so that "Restart Kernel -> Run All" is not interrupted by this demonstration.
try:
    data3_attempt = pd.read_csv('data3.csv')
except ValueError as err:  # pandas' parser errors (e.g. ParserError) derive from ValueError
    data3_attempt = None
    print('read_csv failed as expected:', err)
data3_attempt  # shows the frame if the read unexpectedly succeeds, nothing otherwise

In [None]:
# inspect the raw file to see what the plain read above stumbled over
!cat "data3.csv"

In [None]:
# skiprows=5: skip the first 5 lines of the file before parsing starts
pd.read_csv('data3.csv',skiprows=5)

In [None]:
# alternative usage: pass the 0-based line numbers to skip as an explicit list
lines_to_skip = list(range(5))
pd.read_csv('data3.csv', skiprows=lines_to_skip)

### * Custom column separator

In [None]:
# NOTE(review): presumably this file separates fields with ';' (see the sep=";" cell below) — confirm from the output
!cat "data1_sep.csv"

In [None]:
# default separator is ','
# NOTE(review): if the file uses another separator, each line presumably lands in a single column — confirm from the output
pd.read_csv("data1_sep.csv")

In [None]:
# sep=";": split the fields on semicolons instead of commas
pd.read_csv("data1_sep.csv", sep=";")

### * Custom number specification 

In [None]:
# Load data4.csv with the default settings.
df = pd.read_csv('data4.csv')
# Bare last expression: rich notebook display of the DataFrame instead of print().
df

In [None]:
# inspect the raw file
# NOTE(review): presumably some numbers are written with ',' as thousands separator (see thousands=',' below) — confirm
!cat data4.csv

In [None]:
# print the total number of admins
# NOTE(review): presumably 'Admins' was parsed as a numeric column, so this is an arithmetic sum — confirm with df.dtypes
df['Admins'].sum()

In [None]:
# print the total number of Articles
# NOTE(review): if 'Articles' was read as text (object dtype) because of thousands separators,
# .sum() concatenates the strings instead of adding numbers — check df.dtypes in the next cell
df['Articles'].sum()

In [None]:
# column types as parsed by read_csv; 'object' typically means the column was kept as text
df.dtypes

In [None]:
# thousands=',': treat ',' inside numbers as a thousands separator so such columns parse as numeric
df2 = pd.read_csv("data4.csv", thousands=',')

In [None]:
# column types after re-reading with thousands=','
df2.dtypes

In [None]:
# total number of Articles, computed from the frame that was re-read with thousands=','
df2['Articles'].sum()

### * Missing values

In [None]:
# inspect the raw file
# NOTE(review): presumably missing entries are written as '?' (see na_values below) — confirm from the output
!cat "data5missing.csv"

In [None]:
# default read: only pandas' built-in missing-value markers (e.g. empty fields, "NA") become NaN
pd.read_csv("data5missing.csv")

In [None]:
# na_values=["?"]: additionally treat "?" as a missing value (NaN)
pd.read_csv("data5missing.csv", na_values=["?"])

### Processing Large Files

In [None]:
# reading compressed files (pandas 0.18.1 supports the formats ‘gzip’, ‘bz2’, ‘zip’, ‘xz’)
# the compression type is inferred from the file extension (default compression='infer')
pd.read_csv("data6.zip")

In [None]:
# nrows=3: read only the first 3 data rows of the file
pd.read_csv("data6.zip", nrows=3)

In [None]:
# longer file
!cat "data7longer.csv"

In [None]:
# wc without options prints the line, word and byte counts of the file
!wc "data7longer.csv"

In [None]:
# reading file in chunks
active_users = 0
for df in pd.read_csv('data7longer.csv', chunksize=30):
    print('\nBeginning of DataFrame\n')
    print(df)
    active_users += sum(df['Active Users'])
print('Total number of Active Users', active_users)

### Commonly used parameters

## Exporting Dataframes to CSV files

In [None]:
# load data1.csv as the DataFrame that the following cells export
df1 = pd.read_csv("data1.csv")
df1

In [None]:
# default export: the row index is written as an extra first column
df1.to_csv('data_out.csv')

In [None]:
# inspect the file that was just written
!cat data_out.csv

In [None]:
# index=False: do not write the row index; then inspect the result
df1.to_csv('data_out.csv', index=False)
!cat data_out.csv

In [None]:
# columns=[...]: export only the listed columns; then inspect the result
df1.to_csv('data_out.csv', index=False, columns=['Language','Articles'])
!cat data_out.csv

In [None]:
# Additional exporting options: 
#
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html

## * Alternatives to pandas read_csv()

### Line by line text reading

In [None]:
# Iterate over the file line by line. The `with` block closes the file
# even if an error occurs while reading.
with open('data1.csv','r') as f:
    for line in f:
        # each line keeps its trailing newline, so print() leaves an empty line after every row
        print(line)

### Line by line text parsing with the built-in package csv

In [7]:
import csv
# example 1 - print the parsed csv file
# newline='' is what the csv module documentation recommends for files passed to csv.reader;
# the `with` block guarantees the file is closed, even on error
with open('data1.csv', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
        print(row)

['Language', 'Wiki', 'Articles', 'Pages', 'Edits']
['English', 'en', '5236357', '40155905', '847883223']
['Swedish', 'sv', '3445556', '6910072', '36854915']
['Cebuano', 'ceb', '2885091', '5354566', '10393518']
['German', 'de', '1976573', '5699722', '162755765']
['Dutch', 'nl', '1873902', '3637030', '48492474']


In [8]:
# example 2 - read all lines
lines = list(csv.reader(open('data1.csv')))
header = lines[0]
values = lines[1:]
pd.DataFrame(data=values, columns=header)

Unnamed: 0,Language,Wiki,Articles,Pages,Edits
0,English,en,5236357,40155905,847883223
1,Swedish,sv,3445556,6910072,36854915
2,Cebuano,ceb,2885091,5354566,10393518
3,German,de,1976573,5699722,162755765
4,Dutch,nl,1873902,3637030,48492474


In [9]:
# example 3 - read first 4 lines
csvfile = open('data1.csv')
csvReader = csv.reader(csvfile)
header = next(csvReader)
values = [next(csvReader) for i in range(4)]
csvfile.close()
pd.DataFrame(data=values, columns=header)        

Unnamed: 0,Language,Wiki,Articles,Pages,Edits
0,English,en,5236357,40155905,847883223
1,Swedish,sv,3445556,6910072,36854915
2,Cebuano,ceb,2885091,5354566,10393518
3,German,de,1976573,5699722,162755765


## Class Exercise 1
The format of the file "data1.csv" has been modified and saved as  "data1modified.csv"
Load the file "data1modified.csv" into a DataFrame equivalent to pd.read_csv("data1.csv")
You may use df1==df2 for comparison.

Hint: start by printing the two files to screen

In [5]:
# write your solution here
!cat data1.csv
!cat data1modified.csv
df1 = pd.read_csv("data1.csv")
# data1modified.csv uses ';' as separator, has no header line and carries an extra leading
# number column: name all six columns, then keep only the five that data1.csv has
all_columns = ['num','Language','Wiki','Articles','Pages','Edits']
df2 = pd.read_csv(
    "data1modified.csv",
    sep=';',
    names=all_columns,
    usecols=all_columns[1:],
)
df1==df2

Language,Wiki,Articles,Pages,Edits
English,en,5236357,40155905,847883223
Swedish,sv,3445556,6910072,36854915
Cebuano,ceb,2885091,5354566,10393518
German,de,1976573,5699722,162755765
Dutch,nl,1873902,3637030,48492474
11;English;en;5236357;40155905;847883223
12;Swedish;sv;3445556;6910072;36854915
13;Cebuano;ceb;2885091;5354566;10393518
14;German;de;1976573;5699722;162755765
15;Dutch;nl;1873902;3637030;48492474


Unnamed: 0,Language,Wiki,Articles,Pages,Edits
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True


<br style=margin:500px;>
___

In [None]:
# original file: comma-separated, first line is the header
!cat data1.csv

In [None]:
# modified file: ';'-separated, no header line, extra leading number column
!cat data1modified.csv

In [None]:
# Solution
df1 = pd.read_csv("data1.csv")
# name the extra leading column 'num' so it can be left out via usecols
wanted = ['Language','Wiki','Articles','Pages','Edits']
df2 = pd.read_csv("data1modified.csv", sep=';', names=['num'] + wanted, usecols=wanted)
# element-wise comparison: every cell should be True
df1==df2

## Class Exercise 2
Explore the file "data pageviews-20160801-000000"
  1. What is the size of the file? (3,705,962 bytes ≈ 3.7 MB)
  2. Print the first 10 rows
  3. Load the file into a DataFrame. Set the column names to: "Project","Title","Pageviews","x"
  4. What is the total number of rows? (df.shape, 100,000)
  5. What is the total number of Pageviews? (df["Pageviews"].sum() 233,058)

In [None]:
# write your solution here

<br style=margin:500px;>
___

In [None]:
# Solution
# 1) Check file sizes (ls -l lists each file in the directory together with its size in bytes)
!ls -l

In [None]:
# 2) Print the first 10 rows
!head -10 "data pageviews-20160801-000000"

In [None]:
# 3) Load the file into a DataFrame
# fields are split on single spaces; names=... supplies the column labels,
# so the first line of the file is read as data
pageview_columns = ["Project","Title","Pageviews","x"]
df = pd.read_csv(
    "data pageviews-20160801-000000",
    sep=" ",
    names=pageview_columns,
    encoding='utf8',
)

In [None]:
# 4) print the dimensions of the loaded dataframe
df.shape

In [None]:
# 5) compute the total number of pageviews
df["Pageviews"].sum()

In [None]:
# preview the first 10 rows of the loaded DataFrame
df.head(10)