# Getting started
To use pandas, you'll typically start with the following line of code.

In [None]:
import pandas as pd
import numpy as np

#pip install pandas

## 1. Creating data
There are two core objects in pandas: the <font color='red'><b> DataFrame </b></font> and the <font color='red'><b> Series </b></font>.

### 1.1. DataFrame
A DataFrame is a table. It contains an array of individual entries, each of which has a certain value. Each entry corresponds to a row (or record) and a column.

<img src='DataFrame.jpg'  height="400px" width="400px" style = "float:left"/>

* The <font color='red'><b> dictionary-list constructor </b></font> assigns values to the column labels

In [None]:
# Dictionary-list constructor: each key becomes a column label and each list
# supplies that column's values, one entry per row.
my_table = pd.DataFrame(
    data={
        'Name': ['Anastasia', 'Dima'],
        'Score': [12.5, 9.0],
    },
)
my_table

* The list of row labels used in a DataFrame is known as an <font color='red'><b> Index </b></font>. We can assign values to it by passing the `index` parameter to the constructor.

In [None]:
# Same table as before, but with explicit row labels supplied through `index`
# instead of the default 0..n-1 labels.
my_table = pd.DataFrame(
    data={
        'Name': ['Anastasia', 'Dima'],
        'Score': [12.5, 9.0],
    },
    index=['Student 1', 'Student 2'],
)
my_table

### 1.2. Series
A Series, by contrast, is a sequence of data values. If a DataFrame is a table, a Series is a list. And in fact you can create one with nothing more than a list:

In [None]:
pd.Series([1, 2, 3, 4, 5] , name='Test1')

* The Series and the DataFrame are intimately related. It's helpful to think of a DataFrame as actually being just a bunch of Series <font color='red'><b> glued together </b></font>.

In [None]:
# Two named Series; each Series name becomes a column label once they are
# placed side by side in a single DataFrame.
one = pd.Series(range(1, 6), name='Test1')
two = pd.Series(range(6, 11), name='Test2')
test_table = pd.concat(objs=[one, two], axis='columns')
test_table

## 2. Reading data files

In [None]:

my_table = pd.read_csv('salesmonthly.csv')
# my_table

### 2.1. Useful attributes and methods

In [None]:
my_table.shape

In [None]:
my_table.head()

In [None]:
my_table.head(3)

In [None]:
my_table.tail(3)

In [None]:
my_table.sample(n=3)

In [None]:
my_table.sample(n=3)

In [None]:
my_table.sample(n=3, random_state=1)

In [None]:
my_table.sample(n=3, random_state=20)

In [None]:
my_table.sample(frac=0.03, random_state=2)

In [None]:
my_table.sample(frac=0.06, random_state=42)

In [None]:
my_table.info()

In [None]:
my_table.describe()

In [None]:
my_table.describe(include = 'all')

In [None]:
my_table.describe(percentiles=[0.125,0.17,0.05,0.6])

In [None]:
# .copy() returns a new DataFrame with its own data, so in-place changes made
# to my_table_1 afterwards do not show up in my_table.
my_table_1 = my_table.copy()
my_table_1

In [None]:
# Plain assignment does not copy anything: my_table_2 is just a second name
# for the very same DataFrame object that my_table refers to.
my_table_2 = my_table
my_table_2

In [None]:
# Rename 'date' in place on my_table_1 (created with .copy()), then display
# the first rows of my_table to compare its column names.
my_table_1.rename(columns = {'date': 'invoice_date'}, inplace=True)
my_table.head(3)

In [None]:
my_table_1.head(3)

In [None]:
# Same in-place rename, this time through my_table_2. It was bound with
# `my_table_2 = my_table` (no copy), so the rename is applied to the object
# that my_table refers to as well.
my_table_2.rename(columns = {'date': 'invoice_date'}, inplace=True)
my_table_2

In [None]:
my_table.head(3)

### 2.2. Accessing DataFrame

In [None]:
my_table = pd.read_csv('salesmonthly.csv')
my_table.head(3)

In [None]:
# df['column']
my_table['product1']

In [None]:
#df.column
my_table.product1

In [None]:
# Attribute access only works when the column label is a valid Python
# identifier. `my_table.product 8` is a SyntaxError because of the space,
# so a label like this has to be selected with brackets instead.
my_table['product 8']

In [None]:
# Bracket access works for any column label, including labels that are not
# valid Python identifiers (a label containing a space, or - per the original
# note here - a column name in Farsi).
# Passing a list of labels (double brackets) returns a DataFrame, not a Series.
my_table[['product 8']]

In [None]:
my_table['product 8'][0]

In [None]:
my_table.product1[0]

In [None]:
my_table[['product1','product2']]

In [None]:
my_table.product1.describe()

In [None]:
my_table.product1.mean()

In [None]:
my_table.product1.max()

In [None]:
my_table.product1.argmax()

In [None]:
my_table.Customer.unique()

In [None]:
my_table.Customer.nunique()

In [None]:
#***
my_table.Customer.value_counts()

In [None]:
my_table.Customer.value_counts(normalize=True)*100

In [None]:
my_table.product1 - 100

In [None]:
my_table.product1 - my_table.product1.mean()

In [None]:
# Add a column holding the length of each Customer value.
# .str.len() is the vectorised string accessor; it yields NaN for a missing
# value, whereas .apply(len) raises TypeError as soon as it meets a NaN.
my_table['Cus_length'] = my_table.Customer.str.len()

In [None]:
my_table.head()

## 3. Indexing in Pandas

The indexing operator and attribute selection work just as they do in the rest of the Python ecosystem.  
However, pandas has its own accessor operators, <font color='red'><b> loc </b></font> and <font color='red'><b> iloc </b></font> for more advanced operations.


### 3.1. Index-based selection
Index-based selection means selecting data based on its numerical position in the data. <font color='red'><b> iloc </b></font> follows this paradigm.  

<img src='iloc-structure.png'  height="300px" width="300px" style = "float:left"/>

In [None]:
my_table.head()

In [None]:
# output: Series
my_table.iloc[1]

In [None]:
#output : DataFrame
my_table.iloc[[1]]

In [None]:
my_table.iloc[1,]

In [None]:
my_table.iloc[1:3,0:6]

In [None]:
my_table.iloc[[1,4,12],[1,3,7]]

In [None]:
# my_table.product1.max()
# my_table.product1.argmax()
my_table.iloc[[my_table.product1.argmax()]]

In [None]:
my_table.iloc[my_table.product1.argmax()]

In [None]:
my_table.iloc[-5:]

In [None]:
my_table[-5:]

In [None]:
# NOTE(review): plain [] indexing does not accept a (rows, columns) pair, so
# this cell is expected to raise. It appears to be here as a contrast with the
# .iloc[:, 1:7] form - confirm that the failure is intentional.
my_table[:,1:7]

In [None]:
my_table.iloc[:,1:7]

In [None]:
my_table.head(3)

### 3.2. Label-based selection
This paradigm for attribute selection is the one followed by the <font color='red'><b> loc </b></font> operator: label-based selection. In this paradigm, it's the data index value, not its position, which matters.

In [None]:
my_table.head()

In [None]:
my_table.loc[0:3,'product1']

In [None]:
my_table.loc[0:3,['product1']]

In [None]:
my_table.iloc[0:3,[1]]

In [None]:
my_table_2 = pd.read_csv('salesmonthly.csv' , index_col='date') #index_col = 0
my_table_2

In [None]:
my_table_2.loc[['1/31/2014','3/31/2014'],['product1','product5']]

In [None]:
my_table_2.iloc[[0,2],[0,4]]

### 3.3. Choosing between loc and iloc
<font color='blue'><b> iloc </b></font> uses the Python stdlib indexing scheme, where the **first element of the range is included and the last one excluded.** So `0:10` will select entries 0,...,9 (10 entries).  
<font color='blue'><b> loc </b></font>, meanwhile, **indexes inclusively.** So `0:10` will select entries 0,...,10 (10+1 entries).

In [None]:
my_table_2.loc['1/31/2014':'5/31/2014','product1':'product4']

### 3.4. Manipulating the index

In [None]:
my_table = pd.read_csv('salesmonthly.csv')
my_table

In [None]:
my_table.set_index('Customer')

In [None]:
my_table.head(3)

In [None]:
# 1st way to set index (Creating new df)

edited_table = my_table.set_index('Customer')
edited_table

In [None]:
edited_table = edited_table.reset_index()
edited_table

In [None]:
# 2nd Way to set index (inplace)

my_table.set_index('Customer', inplace=True)
my_table.head()

In [None]:
my_table.reset_index(inplace=True)
my_table.head()

In [None]:
# If reset_index is applied a second time (without drop=True), an extra column
# is added: the current index is inserted into the frame as a new column.
# Run the code below to see the extra column.

my_table.reset_index(inplace=True)
my_table.head()

In [None]:
# The correct way to deal with set and reset index is as follows

my_table = pd.read_csv('salesmonthly.csv')
my_table.head(3)

In [None]:
my_table.set_index('Customer', inplace=True)
my_table.head()

In [None]:
# drop=True discards the current index instead of inserting it as a column.
# NOTE(review): the index at this point is 'Customer' (set in the previous
# cell), so those values are dropped rather than restored as a column -
# confirm this is what the example intends to show.
my_table.reset_index(drop = True,inplace = True)
my_table.head()

In [None]:
my_table = pd.read_csv('salesmonthly.csv')
my_table.set_index('Customer', inplace=True)
my_table.head()

In [None]:
my_table.loc[['mammad'],['product1']]

## 4. Assigning
Assigning data to a DataFrame is easy. You can assign either a constant value or an iterable of values.

In [None]:
my_table = pd.read_csv('salesmonthly.csv')
my_table.head(3)

In [None]:
my_table['type'] = "Sales"
my_table

In [None]:
my_table.shape

In [None]:
len(my_table)

In [None]:
my_table['reverse_index'] = range(len(my_table), 0 , -1)
my_table

In [None]:
my_table['total1'] = my_table.product1 + my_table['product 8']
my_table.head()

In [None]:
# Label-based column slice with .loc: all rows, and every column from
# 'product1' through 'product 8' (with .loc both end labels are included).
baghali = my_table.loc[:,'product1':'product 8']
baghali

In [None]:
# Row-wise total: select the columns 'product1' through 'product 8' by label,
# then sum across them (axis = 1 adds along the columns, giving one value per row).
my_table['total2'] = my_table.loc[:,'product1':'product 8'].sum(axis = 1)
my_table.head()