# Pandas

[Pandas](https://pandas.pydata.org/) is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.

### Installation:

<div class="alert alert-block alert-warning">
$ pip install pandas
</div>

In [1]:
! pip install pandas



In [2]:
import pandas as pd

# Series

Defining a Series

In [3]:
obj = pd.Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj[1]

7

In [6]:
obj.values

array([ 4,  7, -5,  3])

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])

In [9]:
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [12]:
obj2['b']

7

In [13]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [14]:
obj2['d'] = -3
obj2

a    4
b    7
c   -5
d   -3
dtype: int64

In [16]:
obj2[['b', 'c']]

b    7
c   -5
dtype: int64

In [17]:
obj2[obj2>0]

a    4
b    7
dtype: int64

In [23]:
pd.Series([1000, 1001, 1002, 1001, 1002], index=pd.date_range("2018-01-01", periods=5, freq="2D"))

2018-01-01    1000
2018-01-03    1001
2018-01-05    1002
2018-01-07    1001
2018-01-09    1002
Freq: 2D, dtype: int64

# DataFrame

Defining a DataFrame using dictionary

In [19]:
sdata = {'Ohio': [3500, 4000],
         'Tehran': [5000, 3200],
         'Utah': [2030, 3030]}
df = pd.DataFrame(sdata)

In [20]:
df

Unnamed: 0,Ohio,Tehran,Utah
0,3500,5000,2030
1,4000,3200,3030


In [24]:
import numpy as np

In [29]:
arr = np.random.uniform(0, 1000, (100, 6))

In [32]:
df = pd.DataFrame(arr, columns=['a', 'b', 'c', 'd', 'e', 'f'])

# Viewing data

+ head and tail
+ index and columns
+ to_numpy
+ describe and info
+ sorting

In [34]:
df.head(12)

Unnamed: 0,a,b,c,d,e,f
0,252.978743,283.779307,67.866077,448.775774,874.436497,152.516189
1,750.661393,250.442725,334.22113,623.139169,159.391474,715.707957
2,8.779598,833.145728,122.605304,456.812921,557.341705,701.604572
3,52.993377,4.640474,238.707753,744.304726,195.705725,358.242964
4,304.762086,990.485113,597.410274,372.62101,321.062138,242.275668
5,567.248569,614.875342,28.579365,37.194229,364.425348,824.476525
6,20.013374,76.959774,314.293344,874.434973,864.104467,994.01403
7,51.966211,908.137531,797.543221,670.052057,197.197934,155.670844
8,910.760143,551.358996,598.394414,189.637742,151.382031,952.691065
9,335.744964,506.889128,639.561961,645.794871,245.463964,550.709499


In [35]:
df.tail()

Unnamed: 0,a,b,c,d,e,f
95,502.896728,360.044236,357.973938,56.17164,85.166332,720.667804
96,511.580495,970.687371,857.60158,561.638891,456.824636,588.579954
97,301.782906,81.695747,732.595398,43.035801,996.791783,617.495754
98,87.9999,611.332882,948.207611,393.782958,139.855947,639.687342
99,255.056085,20.148866,153.525874,805.53149,969.594018,795.730755


In [36]:
df.tail(3)

Unnamed: 0,a,b,c,d,e,f
97,301.782906,81.695747,732.595398,43.035801,996.791783,617.495754
98,87.9999,611.332882,948.207611,393.782958,139.855947,639.687342
99,255.056085,20.148866,153.525874,805.53149,969.594018,795.730755


In [37]:
df.index

RangeIndex(start=0, stop=100, step=1)

In [38]:
df.columns

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [39]:
df['c']

0      67.866077
1     334.221130
2     122.605304
3     238.707753
4     597.410274
         ...    
95    357.973938
96    857.601580
97    732.595398
98    948.207611
99    153.525874
Name: c, Length: 100, dtype: float64

In [41]:
df[['c', 'e']].tail(6)

Unnamed: 0,c,e
94,817.176923,93.092115
95,357.973938,85.166332
96,857.60158,456.824636
97,732.595398,996.791783
98,948.207611,139.855947
99,153.525874,969.594018


In [42]:
df[['c', 'e']].head(6)

Unnamed: 0,c,e
0,67.866077,874.436497
1,334.22113,159.391474
2,122.605304,557.341705
3,238.707753,195.705725
4,597.410274,321.062138
5,28.579365,364.425348


In [51]:
df.iloc[1, :]

a    750.661393
b    250.442725
c    334.221130
d    623.139169
e    159.391474
f    715.707957
Name: 1, dtype: float64

In [52]:
df.iloc[:, 1]

0     283.779307
1     250.442725
2     833.145728
3       4.640474
4     990.485113
         ...    
95    360.044236
96    970.687371
97     81.695747
98    611.332882
99     20.148866
Name: b, Length: 100, dtype: float64

In [56]:
df.iloc[30:60, [1, 3]]

Unnamed: 0,b,d
30,768.643182,906.279342
31,984.659316,700.042491
32,636.926984,92.76932
33,521.332753,649.877577
34,213.749876,430.047075
35,470.585544,544.782564
36,479.615856,253.019097
37,228.262723,317.416977
38,385.0017,771.405671
39,191.340666,655.981224


In [59]:
df.loc[2]

a      8.779598
b    833.145728
c    122.605304
d    456.812921
e    557.341705
f    701.604572
Name: 2, dtype: float64

In [61]:
df.loc[30:60, ['a', 'c']]

Unnamed: 0,a,c
30,776.092976,366.428476
31,76.565772,119.838037
32,855.26507,360.57248
33,405.156378,64.208835
34,734.700014,94.182498
35,71.097584,826.075063
36,345.54395,113.463766
37,914.622362,370.4397
38,78.813865,144.621362
39,370.983008,649.75923


In [62]:
df.describe()

Unnamed: 0,a,b,c,d,e,f
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,462.521996,472.737233,483.604911,529.064522,424.35865,523.23984
std,273.801082,287.193835,284.852867,278.634726,274.715691,274.546273
min,0.601483,4.032009,6.345523,1.798502,0.442964,5.686224
25%,251.892849,224.634511,229.922596,313.747223,194.67182,307.063041
50%,462.421947,471.748283,450.565126,581.648048,390.306545,567.722827
75%,696.455484,677.875583,742.042353,746.523581,639.390984,742.108917
max,987.102533,999.328741,954.607638,992.169763,996.791783,994.879281


In [72]:
df['s'] = [str(i) for i in range(1, 101)]
df['i'] = [i for i in range(1, 101)]

In [73]:
df

Unnamed: 0,a,b,c,d,e,f,s,i
0,252.978743,283.779307,67.866077,448.775774,874.436497,152.516189,1,1
1,750.661393,250.442725,334.221130,623.139169,159.391474,715.707957,2,2
2,8.779598,833.145728,122.605304,456.812921,557.341705,701.604572,3,3
3,52.993377,4.640474,238.707753,744.304726,195.705725,358.242964,4,4
4,304.762086,990.485113,597.410274,372.621010,321.062138,242.275668,5,5
...,...,...,...,...,...,...,...,...
95,502.896728,360.044236,357.973938,56.171640,85.166332,720.667804,96,96
96,511.580495,970.687371,857.601580,561.638891,456.824636,588.579954,97,97
97,301.782906,81.695747,732.595398,43.035801,996.791783,617.495754,98,98
98,87.999900,611.332882,948.207611,393.782958,139.855947,639.687342,99,99


In [74]:
df.describe()

Unnamed: 0,a,b,c,d,e,f,i
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,462.521996,472.737233,483.604911,529.064522,424.35865,523.23984,50.5
std,273.801082,287.193835,284.852867,278.634726,274.715691,274.546273,29.011492
min,0.601483,4.032009,6.345523,1.798502,0.442964,5.686224,1.0
25%,251.892849,224.634511,229.922596,313.747223,194.67182,307.063041,25.75
50%,462.421947,471.748283,450.565126,581.648048,390.306545,567.722827,50.5
75%,696.455484,677.875583,742.042353,746.523581,639.390984,742.108917,75.25
max,987.102533,999.328741,954.607638,992.169763,996.791783,994.879281,100.0


In [76]:
df.iloc[3:6, [1, 3]] = None
df.iloc[10:17, [2, 4]] = None

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       100 non-null    float64
 1   b       97 non-null     float64
 2   c       93 non-null     float64
 3   d       97 non-null     float64
 4   e       93 non-null     float64
 5   f       100 non-null    float64
 6   s       100 non-null    object 
 7   i       100 non-null    int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 6.4+ KB


In [98]:
dic = {'name':['Amir', 'Deli', 'Ali', 'Hosei', 'Danial', 'Amir'],
       'age': [25, 23, 33, 37, 24, 24],
       'phone':[12345678, 3249837, 1234567, 98765, 5678, 30428393]}
data = pd.DataFrame(dic)

In [99]:
data

Unnamed: 0,name,age,phone
0,Amir,25,12345678
1,Deli,23,3249837
2,Ali,33,1234567
3,Hosei,37,98765
4,Danial,24,5678
5,Amir,24,30428393


In [105]:
# data = data.sort_values(['name', 'age'], ascending=False, inplace=True).reset_index(drop=True)
data.sort_values(['name', 'age'], ascending=False, inplace=True)
data.reset_index(drop=True, inplace=True)

# Missing data

+ isna and isnull
+ dropna and fillna

In [151]:
np.sum(df.isna())

a    0
b    3
c    7
d    3
e    7
f    0
s    0
i    0
dtype: int64

In [154]:
np.sum(df.isnull())

a    0
b    3
c    7
d    3
e    7
f    0
s    0
i    0
dtype: int64

In [155]:
df.fillna(-1)

Unnamed: 0,a,b,c,d,e,f,s,i
0,252.978743,283.779307,67.866077,448.775774,874.436497,152.516189,1,1
1,750.661393,250.442725,334.221130,623.139169,159.391474,715.707957,2,2
2,8.779598,833.145728,122.605304,456.812921,557.341705,701.604572,3,3
3,52.993377,-1.000000,238.707753,-1.000000,195.705725,358.242964,4,4
4,304.762086,-1.000000,597.410274,-1.000000,321.062138,242.275668,5,5
...,...,...,...,...,...,...,...,...
95,502.896728,360.044236,357.973938,56.171640,85.166332,720.667804,96,96
96,511.580495,970.687371,857.601580,561.638891,456.824636,588.579954,97,97
97,301.782906,81.695747,732.595398,43.035801,996.791783,617.495754,98,98
98,87.999900,611.332882,948.207611,393.782958,139.855947,639.687342,99,99


In [158]:
df.dropna(inplace=True)

In [159]:
np.sum(df.isnull())

a    0
b    0
c    0
d    0
e    0
f    0
s    0
i    0
dtype: int64

# Operations

+ mean
+ std
+ apply

In [161]:
df.mean(axis=1, numeric_only=True)

0     297.336084
1     405.080550
2     383.327118
6     450.117137
7     398.366828
         ...    
95    311.274383
96    577.701847
97    410.199627
98    417.123806
99    442.798156
Length: 90, dtype: float64

In [163]:
df.var(numeric_only=True)

a    77025.849837
b    82651.346397
c    83510.513339
d    79949.060654
e    77503.806509
f    76308.450093
i      742.147441
dtype: float64

In [164]:
df.std(numeric_only=True)

a    277.535313
b    287.491472
c    288.981856
d    282.752649
e    278.395055
f    276.239842
i     27.242383
dtype: float64

In [165]:
df.sum(numeric_only=True)

a    41639.029176
b    41456.521004
c    44354.891634
d    48382.994492
e    39230.203894
f    46528.560874
i     4937.000000
dtype: float64

In [167]:
df.min()

a    0.601483
b    4.032009
c    6.345523
d    1.798502
e    0.442964
f    5.686224
s           1
i           1
dtype: object

In [180]:
df['h'] = True

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 0 to 99
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       90 non-null     float64
 1   b       90 non-null     float64
 2   c       90 non-null     float64
 3   d       90 non-null     float64
 4   e       90 non-null     float64
 5   f       90 non-null     float64
 6   s       90 non-null     object 
 7   i       90 non-null     int64  
 8   h       90 non-null     bool   
dtypes: bool(1), float64(6), int64(1), object(1)
memory usage: 6.4+ KB


In [201]:
def f(x):
    float_type = [np.float64, np.float32, np.float16]
    int_type = [np.int8, np.int16, np.int32, np.int64]
    if x.dtype in float_type or x.dtype in int_type:
        return np.max(x) - np.mean(x)
    else:
        return None

In [202]:
df.apply(f)


a    524.446654
b    538.700730
c    461.775509
d    454.580935
e    560.900628
f    477.895272
s           NaN
i     45.144444
h           NaN
dtype: float64

In [196]:
new_df.mean(numeric_only=True)

a   -7.326485e-14
b   -1.263187e-15
c    6.315935e-15
d    1.035813e-13
e    2.400055e-14
f    5.305386e-14
i    1.263187e-15
h    1.000000e+00
dtype: float64

# Merging

+ concat
+ merge
+ append
+ join

In [117]:
data.append(pd.DataFrame({'name':['Rose'],
                          'age': [23],
                          'phone':[123456]})).reset_index(drop=True)

  data.append(pd.DataFrame({'name':['Rose'],


Unnamed: 0,name,age,phone
0,Hosei,37,98765
1,Deli,23,3249837
2,Danial,24,5678
3,Amir,25,12345678
4,Amir,24,30428393
5,Ali,33,1234567
6,Rose,23,123456


In [119]:
dic = {'name':['Amir', 'Deli', 'Ali', 'Hosei', 'Danial', 'Amir'],
       'age': [25, 23, 33, 37, 24, 24],
       'phone':[12345678, 3249837, 1234567, 98765, 5678, 30428393]}
data1 = pd.DataFrame(dic)

In [120]:
dic = {'name':['Reza', 'Nazi'],
       'age': [25, 23],
       'phone':[2304032, 3284989]}
data2 = pd.DataFrame(dic)

In [125]:
dic = {'name':['Gholi', 'abdola'],
       'age': [52, 43],
       'phone':[32923, 323242]}
data3 = pd.DataFrame(dic)

In [127]:
pd.concat([data1, data2, data3]).reset_index(drop=True)

Unnamed: 0,name,age,phone
0,Amir,25,12345678
1,Deli,23,3249837
2,Ali,33,1234567
3,Hosei,37,98765
4,Danial,24,5678
5,Amir,24,30428393
6,Reza,25,2304032
7,Nazi,23,3284989
8,Gholi,52,32923
9,abdola,43,323242


In [128]:
dic = {'name':['Amir', 'Deli', 'Ali', 'Hosei', 'Danial'],
       'age': [25, 23, 33, 37, 24, ]}
data1 = pd.DataFrame(dic)

In [132]:
data1

Unnamed: 0,name,age
0,Amir,25
1,Deli,23
2,Ali,33
3,Hosei,37
4,Danial,24


In [141]:
dic = {'name':['Amir', 'Deli', 'Ali', 'Hosei', 'Danial', 'Mirza'],
       'weight': [76, 56, 93, 69, 120, 89],
       'height': [180, 160, 170, 156, 195, 170]}
data2 = pd.DataFrame(dic)

In [142]:
data2

Unnamed: 0,name,weight,height
0,Amir,76,180
1,Deli,56,160
2,Ali,93,170
3,Hosei,69,156
4,Danial,120,195
5,Mirza,89,170


In [143]:
pd.merge(data1, data2, on='name')

Unnamed: 0,name,age,weight,height
0,Amir,25,76,180
1,Deli,23,56,160
2,Ali,33,93,170
3,Hosei,37,69,156
4,Danial,24,120,195


In [144]:
pd.merge(data2, data1, on='name')

Unnamed: 0,name,weight,height,age
0,Amir,76,180,25
1,Deli,56,160,23
2,Ali,93,170,33
3,Hosei,69,156,37
4,Danial,120,195,24


In [148]:
data2.join(data1.set_index('name'), on='name')

Unnamed: 0,name,weight,height,age
0,Amir,76,180,25.0
1,Deli,56,160,23.0
2,Ali,93,170,33.0
3,Hosei,69,156,37.0
4,Danial,120,195,24.0
5,Mirza,89,170,


# Reading and Writing

+ read_csv and to_csv
+ read_excel

In [203]:
data2.to_csv('data2.csv')

In [206]:
loaded_data = pd.read_csv('data2.csv', index_col=0)

In [207]:
loaded_data

Unnamed: 0,name,weight,height
0,Amir,76,180
1,Deli,56,160
2,Ali,93,170
3,Hosei,69,156
4,Danial,120,195
5,Mirza,89,170


In [209]:
data = pd.read_excel('/Users/khamir/Desktop/tDCS-Anxiety/BART/BART.xlsx', engine='openpyxl')

In [210]:
data

Unnamed: 0,Name,Age,Gender,Sideness,Arrangement,Diagnosis,BARTshmAccount,BARTshmPN,BARTshmSUC,BARTshmAV,...,BARTrarmT21,BARTrarmT22,BARTrarmT23,BARTrarmT24,BARTrarmT25,BARTrarmT26,BARTrarmT27,BARTrarmT28,BARTrarmT29,BARTrarmT30
0,M.Reza,18,2,1,1,1,416000,10,20,20.8,...,10.94,13.64,15.74,15.79,12.44,15.41,0.85,0.63,0.75,0.47
1,Mahdiyeh,22,1,2,2,1,400000,9,21,19.0476,...,12.67,48.0,25.38,31.641,45.94,57.67,23.75,57.82,13.07,72.62
2,Fatemeh,31,1,1,3,1,488000,5,25,19.52,...,13.92,9.44,12.42,16.56,11.8,4.08,9.35,4.89,9.95,12.31
3,Masumeh,35,1,1,4,3,912000,7,23,39.6522,...,5.2,3.42,5.73,15.16,5.43,12.33,12.88,11.48,12.82,9.61
4,A.Mohamd,17,2,1,5,1,201000,0,30,6.7,...,4.431,4.586,4.664,3.698,4.992,0.078,4.914,3.26,3.666,5.554
5,Fariba,19,1,1,6,1,816000,7,23,35.4783,...,12.57,16.52,17.89,9.89,3.52,18.38,1.15,12.86,16.67,16.08
6,Zahra,45,1,1,7,2,489000,2,28,17.4643,...,22.35,0.06,21.96,0.06,8.38,14.51,0.07,16.67,0.07,0.36
7,Faezeh.A,27,1,1,9,1,457000,8,22,20.7727,...,11.43,6.26,15.43,16.5,9.94,7.7,6.36,12.471,17.57,14.15
8,Akram,36,1,1,10,4,862000,7,23,37.4783,...,14.5,12.63,8.69,17.3,13.43,14.03,9.91,3.93,1.09,3.95
9,Hamidrez,21,2,1,11,1,823000,12,18,45.7222,...,16.66,16.51,6.13,24.91,15.5,13.75,14.41,24.58,12.81,17.22
