# Pandas

In [None]:
import numpy as np
import pandas as pd
np.set_printoptions(precision=4)

### Working With CSV files

In [None]:
# Load ex1.csv into a DataFrame, print the type of the returned object,
# and preview its first 4 rows.
Data = pd.read_csv('../input/pandas-practice-files/Required files/ex1.csv')
print(type(Data))
Data.head(4)

### Working with TSV files (Tab-Separated Values)

In [None]:
# read_table is called without `sep`, so its default (tab) delimiter is used.
Data_tsv = pd.read_table('../input/pandas-practice-files/Required files/test.tsv')
Data_tsv.head()  # preview the first rows

In [None]:
# Reading a comma-separated file with read_table by passing sep=','
pd.read_table('../input/pandas-practice-files/Required files/ex1.csv', sep=',')
# Without sep=',' read_table would not split on commas, so each line would land in a single column

In [None]:
# header=None: do not take column names from the file, so its first line is kept as a data row
pd.read_table('../input/pandas-practice-files/Required files/ex1.csv', sep=',',header=None)

In [None]:
# header=None keeps every line of the file as data instead of using the first line as the header
pd.read_csv('../input/pandas-practice-files/Required files/ex2.csv', header=None)

In [None]:
# Supply the column names explicitly through `names`
df =pd.read_csv('../input/pandas-practice-files/Required files/ex2.csv', names=['asdfdsfs','fsdf', 'b', 'c', 'sudh', 'message'])
# Names that have no corresponding column in the file are filled with NaN values
df

In [None]:
# index_col takes the column names to use as the index; passing more than one
# (here key1 and key2) builds a multi-level index
parsed = pd.read_csv('../input/pandas-practice-files/Required files/csv_mindex.csv',index_col=['key1', 'key2'])
parsed

In [None]:
# Unnecessary lines can be kept out of the DataFrame by passing their (0-based)
# line positions to skiprows.
# !cat ex4.csv
pd.read_csv('../input/pandas-practice-files/Required files/ex4.csv', skiprows=[0, 2, 3])

In [None]:
# Load ex5.csv, then build a boolean mask of its missing values.
result = pd.read_csv('../input/pandas-practice-files/Required files/ex5.csv')
result
# Check whether the DataFrame contains any NaN values
pd.isnull(result)
# pd.isnull returns a DataFrame of booleans: True where the value is NaN, False elsewhere.

In [None]:
# na_values names extra strings to treat as missing: every 'world' entry is read as NaN
result = pd.read_csv('../input/pandas-practice-files/Required files/ex5.csv',na_values='world')
result

In [None]:
# read_excel can read from a specific sheet of a workbook; sheet_name selects which one.
draft3 = pd.read_excel('../input/pandas-practice-files/Required files/ex1.xlsx',sheet_name = 0) # 0 = position of the sheet (a sheet-name string is also accepted)
draft3.head(6)

### Reading HTML tables
#### Note: It will read only the data that is present in tabular form on the website.

In [None]:
!pip3 install lxml

In [None]:
url = "http://www.basketball-reference.com/leagues/NBA_2015_totals.html"
BB_data = pd.read_html(url)         # Returns a list of DataFrames, one per table found at the url
BB_data[0].iloc[:, 0:20].head(5)      # First 5 rows of the first table, limited to its first 20 columns

In [None]:
# Load the Titanic data straight from a URL; sep='\t' makes read_csv split
# fields on tabs even though the file carries a .csv extension.
titanic_train = pd.read_csv("https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv",
                           sep='\t')

In [None]:
titanic_train.head(10) # preview the first 10 rows

In [None]:
# Column labels of the Titanic DataFrame
titanic_train.columns

In [None]:
# Other strings or characters can be declared as missing markers: here 'world' is parsed as NaN
result = pd.read_csv('../input/pandas-practice-files/Required files/ex5.csv',na_values='world')
result

In [None]:
# Select the 'Name' and 'Pclass' columns by passing a list of column labels
titanic_train[["Name","Pclass"]].head() # head() returns the first 5 rows by default

In [None]:
# Show the data type (dtype) of each column
titanic_train.dtypes

In [None]:
# pandas lets us customise its display settings; here the maximum number of rows shown is set to 10
pd.options.display.max_rows = 10

In [None]:
# Load all of ex6.csv and display it
result = pd.read_csv('../input/pandas-practice-files/Required files/ex6.csv')
result

In [None]:
# nrows=5 reads only the first 5 rows of the file, which saves resources on large data sets
pd.read_csv('../input/pandas-practice-files/Required files/ex6.csv', nrows=5)

In [None]:
# With chunksize=100 read_csv returns an iterator over the file instead of one DataFrame
chunk = pd.read_csv('../input/pandas-practice-files/Required files/ex6.csv', chunksize=100)
# Each item yielded by the iterator is a DataFrame holding up to 100 rows
for i in chunk:
    print(i)

### Writing Data to Text Format

In [None]:
# Load ex5.csv; `data` is the DataFrame written out in the following cells
data = pd.read_csv('../input/pandas-practice-files/Required files/ex5.csv')
data

In [None]:
# Save the DataFrame to a CSV file (written relative to the current working directory)
data.to_csv('Processed_DataFrame.csv')

In [None]:
import sys
# sep='@' writes '@' as the field delimiter instead of the default comma
data.to_csv('out1.csv', sep='@')

In [None]:
# Write the CSV text to stdout; na_rep is the string written in place of missing values
data.to_csv(sys.stdout, na_rep='Purvansh')

In [None]:
# index=False and header=False leave out the row labels and the column-name line
data.to_csv('out2.csv', index=False, header=False,sep=',')

In [None]:
# Write only columns a, b and c, without the row index
data.to_csv('out3.csv', index=False, columns=['a', 'b', 'c'])

In [None]:
# Seven consecutive dates starting at 1/1/2000
dates = pd.date_range('1/1/2000', periods=7)
# Series of 0..6 indexed by those dates, written out as CSV
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')
# `dates` is rebound here: the date index is wrapped in a DataFrame and saved as well
dates =pd.DataFrame(dates)
dates.to_csv('Dates.csv')
#! tseries.csv

Working with Delimited Formats

In [None]:
import csv
# Read ex7.csv with the standard-library csv module and print each parsed row
# (csv.reader yields every row as a list of strings).
# The file is opened in a `with` block so the handle is closed once the rows
# have been consumed, instead of being left open for the rest of the session.
with open('../input/pandas-practice-files/Required files/ex7.csv') as f:
    reader = csv.reader(f)
    for text in reader:
        print(text)
reader

In [None]:
# Read the whole file into memory: `lines` is a list with one list of strings per CSV row
with open('../input/pandas-practice-files/Required files/ex7.csv') as f:
    lines = list(csv.reader(f))
print(lines)

In [None]:
# The first parsed line holds the column names; every remaining line is a data row.
header = lines[0]
values = lines[1:]

In [None]:
# zip(*values) transposes the rows into columns; each column is then paired
# with its header name to give a {column name: column values} mapping.
data_dict = dict(zip(header, zip(*values)))
data_dict

In [None]:
# Build a DataFrame from the column dict and label its rows 'one' and 'two'
# (two index labels are given, so this presumably expects two data rows; confirm against ex7.csv)
pd.DataFrame(data_dict,index=['one','two'])

## Working With JSON Data

In [None]:
# A JSON document held in a Python string: an object with string, array,
# null and nested-object values.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [None]:
import json
# json.loads parses the JSON text into Python objects; the printed type shows what was returned
result = json.loads(obj)
print(type(result))
result

In [None]:
# json.dumps goes the other way: it serialises the Python object back into a JSON string
asjson = json.dumps(result)
print(type(asjson))
asjson

In [None]:
# Build a DataFrame from the list of sibling records, using name, age and pets as the columns
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age','pets'])
siblings

In [None]:
# read_json loads a JSON file directly into a pandas object
data = pd.read_json('../input/pandas-practice-files/Required files/example.json')
data

In [None]:
# Serialise back to JSON: first with the default orientation, then with
# orient='records', which emits a list holding one JSON object per row
print(data.to_json())
print(data.to_json(orient='records'))

XML and HTML: Web Scraping

In [None]:
# read_html returns a list of DataFrames, one per table found in the document
tables = pd.read_html('fdic_failed_bank_list.html')
len(tables)
# Work with the first table
failures = tables[0]
failures.head()

In [None]:
# Parse the 'Closing Date' column into datetimes, then count how many rows fall in each year
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()
#close_timestamps

# Binary Data Formats

In [None]:
# Load ex1.csv and store the DataFrame in pickle format in the file 'frame_pickle'
frame = pd.read_csv('../input/pandas-practice-files/Required files/ex1.csv')
frame
frame.to_pickle('frame_pickle')

In [None]:
# Read the pickled DataFrame back from the file written by to_pickle
pd.read_pickle('frame_pickle')

Using HDF5 Format

In [None]:
!pip install tables

In [None]:
# HDF5 (Hierarchical Data Format) is an open-source file format for storing large amounts of numerical data.
# One column of 100 random normal values to store
frame = pd.DataFrame({'a': np.random.randn(100)})
# Open the HDF5 file 'mydata.h5' as a dict-like store
store = pd.HDFStore('mydata.h5')
# Objects are saved by assigning them to a key: the whole DataFrame, then a single column
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

In [None]:
# Read the stored DataFrame back by its key and preview it
store['obj1'].head()

In [None]:
# format='table' stores the frame in the queryable table layout,
# which lets select() filter rows with a `where` condition on the index
store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
# Close the store once done with it
store.close()

In [None]:
# Shortcut API: write and read an HDF5 key without opening an HDFStore by hand.
# `key` is passed by keyword because passing it positionally to to_hdf is
# deprecated (everything after the path becomes keyword-only in pandas 3.0).
frame.to_hdf('mydata.h5', key='obj3', format='table')
# Read back only the rows whose index is below 5
pd.read_hdf('mydata.h5', key='obj3', where=['index < 5'])

# Web APIs

In [None]:
import requests
# Fetch the issue list of the pandas repository from the GitHub REST API.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# A timeout keeps the cell from hanging indefinitely if the server does not answer.
resp = requests.get(url, timeout=30)
# Stop here with a clear HTTP error on a 4xx/5xx answer instead of failing
# later when the JSON body is indexed.
resp.raise_for_status()
print(resp)

In [None]:
# Decode the JSON body of the response, then look at the 'user' field of its third element
data = resp.json()
data[2]['user']

In [None]:
# Build a DataFrame from the decoded records, keeping only the listed fields as columns
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

# Interacting with Databases

In [None]:
import sqlite3
# Open (or create) the SQLite file that will hold the `test` table.
con = sqlite3.connect('mydata2.sqlite')
# Drop any earlier copy of the table on this same connection so the cell can
# be re-run; dropping it in a different database file would leave the old
# table in place and make the CREATE TABLE below fail on a second run.
con.execute("drop table if exists test")
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con.execute(query)
con.commit()

In [None]:
# Rows to insert: one tuple per row, in the column order of the `test` table
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
# The ? placeholders are bound from each tuple; executemany runs the statement once per tuple
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
# Persist the inserted rows
con.commit()

In [None]:
# Run the query and fetch all result rows as a list of tuples
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

In [None]:
# cursor.description holds one tuple per result column; the first item of
# each tuple is the column name, which is used to label the DataFrame columns.
print(cursor.description)
pd.DataFrame(
    rows,
    columns=[column[0] for column in cursor.description],
)

In [None]:
import sqlite3
# Connect to the database file.
db = sqlite3.connect("my_database5.db")
# Start from a clean slate: remove any earlier grades1 table, then recreate it.
db.execute("drop table if exists grades1")
db.execute("create table grades1(id int, name text, score int)")
# Insert the five sample rows in one batch; each tuple is (id, name, score).
db.executemany(
    "insert into grades1(id, name, score) values(?, ?, ?)",
    [
        (101, 'John', 99),
        (102, 'Gary', 90),
        (103, 'James', 80),
        (104, 'Cathy', 85),
        (105, 'Kris', 95),
    ],
)

In [None]:
# Commit the pending changes made through `db`
db.commit()

In [None]:
# Print every row of grades1 in id order, followed by a separator line.
results = db.execute("select * from grades1 order by id")
for row in results:
    print(row)
print("-" * 60)

In [None]:
# Print only the rows whose name is 'Gary', followed by a separator line.
results = db.execute("select * from grades1 where name = 'Gary' ")
for row in results:
    print(row)
print("-" * 60)

In [None]:
# Rows with a score of at least 90, followed by a separator line
results = db.execute("select * from grades1 where score >= 90 ")
for row in results:
    print(row)
print("-" * 60 )

In [None]:
# Name and score only, highest score first, followed by a separator line
results = db.execute("select name, score from grades1 order by score desc ")
for row in results:
    print(row)
print("-" * 60 )

In [None]:
# Name and score only, ordered by score with no direction given, followed by a separator line
results = db.execute("select name, score from grades1 order by score")
for row in results:
    print(row)
print("-" * 60 )

In [None]:
# Same query as the previous cell (name and score ordered by score), printed without a separator line
results = db.execute("select name, score from grades1 order by score")
for row in results:
    print(row)