# Working with Data

**Opening a csv file from url**

In [4]:
import numpy as np
import pandas as pd

In [6]:
import requests
from io import StringIO

# Download the CSV over HTTP and hand the text to pandas through an in-memory buffer.
url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
# A browser-like User-Agent is sent along with the request.
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/201000101 Firefox/66.0"}
# timeout: without it requests may wait indefinitely on an unresponsive server.
req = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an HTTP error page as if it were CSV.
req.raise_for_status()
# read_csv accepts any file-like object, so wrap the response body in StringIO.
data = StringIO(req.text)

pd.read_csv(data)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA
...,...,...
189,Paraguay,SOUTH AMERICA
190,Peru,SOUTH AMERICA
191,Suriname,SOUTH AMERICA
192,Uruguay,SOUTH AMERICA


In [None]:
# Tab-separated (TSV) file: pass the tab character as the separator.
# The file carries no header line, so the column names are supplied via `names`.
pd.read_csv(
    "movie_titles_metadata.tsv",
    sep="\t",
    names=["sno", "name", "release_year", "rating", "votes", "genres"],
)

In [8]:
# index_col parameter
# Use the given column as the row index instead of the default 0..n-1 index.
pd.read_csv("", index_col="column_name")

In [None]:
# header parameter
# `header` is a zero-indexed row number: header=1 takes the SECOND line of the
# file as the column names, and the lines before it are ignored.
# Useful when the real column names do not sit on the first line.
pd.read_csv("", header=1)

In [None]:
# usecols parameter
# Load only the listed columns; every other column in the file is dropped.
pd.read_csv("", usecols=["column1", "col2", "col3", "col4"])

In [None]:
# skiprows / nrows parameters
# skiprows takes zero-indexed line numbers: [0, 1] skips the first two lines.
# It also accepts a callable, so custom logic works too, e.g. skip every 5th
# line, or only the odd / even lines.
pd.read_csv("", skiprows=[0, 1])

# nrows limits how many rows are read: only the first 100 come back.
# Handy for peeking into a very large file (e.g. 1,000,000 rows).
pd.read_csv("", nrows=100)

In [None]:
# encoding parameter
# Tell pandas which encoding the file uses when it is not UTF-8.
# Alternative: re-save the file with a different encoding in an editor
# such as Sublime Text.
pd.read_csv("", encoding="latin-1")

In [None]:
# Skipping malformed lines
# A line with more fields than the header makes the default parser raise:
pd.read_csv('')  # Error -> Expected 8 fields on line 65, saw 9
# on_bad_lines='skip' drops such lines instead of raising.
# (It replaces error_bad_lines=False, which was deprecated in pandas 1.3
# and removed in pandas 2.0.)
pd.read_csv('', on_bad_lines='skip')

In [None]:
# dtype parameter
# Map a column name to the type it should be read as (here: float -> int).
pd.read_csv("", dtype={"colName whose dtype we want from float to int": int})

In [None]:
# Handling dates
# Dates are often stored as plain strings, so date-specific features are unavailable.
pd.read_csv('').info()  # the date column shows up as dtype object (i.e. string)
# Ask pandas to parse the column into datetime values while reading.
# Note: the keyword is `parse_dates` (plural); `parse_date` raises a TypeError.
pd.read_csv('', parse_dates=['colName which we need to parse'])
# The column is now a datetime column, so date features can be used on it.

**CONVERTERS**

In [None]:
def rename(name):
    """Return the replacement label for the one matching value, else ``name`` unchanged.

    Intended as a per-value converter: the single value
    "Name present in the row" is mapped to "The new_name we want there";
    every other input is passed through as-is.
    """
    return "The new_name we want there" if name == "Name present in the row" else name

In [None]:
# Quick manual check of rename(): pass a value as it would appear in the row and inspect the returned value.
rename("Name we will be passing which is present in the row and will be replacing by new_name")

In [None]:
# converters maps a column name to a function applied to every value of that
# column while the file is being read.
# Note: the keyword is spelled `converters`; `convertors` raises a TypeError.
pd.read_csv('', converters={'col_name where we want to see changes': rename})
# The transformation is applied to that column in the resulting DataFrame.

**Missing Values**

In [None]:
# Missing values may be encoded as NaN or with a placeholder such as '-'.
# na_values lists extra strings to treat as missing: every cell equal to
# 'Male', 'Female' or 'etc' is read as NaN.
pd.read_csv("", na_values=["Male", "Female", "etc"])

**Chunks**

In [None]:
# With chunksize set, read_csv returns an iterator that yields the file
# piece by piece (up to 5000 rows per chunk) instead of one big DataFrame.
dfs = pd.read_csv("", chunksize=5000)

In [None]:
# Iterate over the chunked reader; each item is a regular DataFrame.
# (The loop variable must match the name used in the body: the original
# bound `chunks` but printed `chunk`, which raises NameError.)
for chunk in dfs:
    print(chunk.shape)
# (4158,14)
# (4158,14)
# (4158,14)
# (4158,14)
 