<a href="https://colab.research.google.com/github/psg-dsci/nurds/blob/main/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Four ways to build a pandas Series.

# 1. From a plain list -- pandas supplies the default integer index.
series1 = pd.Series([1, 2, 3, 4, 5])

# 2. From a list plus one explicit label per element.
series2 = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))

# 3. From a dict -- the keys become the index labels.
data = dict(a=1, b=2, c=3, d=4, e=5)
series3 = pd.Series(data)

# 4. From another list, again with the default integer index.
series4 = pd.Series([10, 20, 30, 40, 50])

# Show all four, one after another.
print(series1, series2, series3, series4, sep='\n')


0    1
1    2
2    3
3    4
4    5
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
0    10
1    20
2    30
3    40
4    50
dtype: int64


In [None]:
# Seed NumPy's global generator before the first random draw so that this
# cell prints the same numbers on every Restart & Run All. Any later call to
# np.random.* in the same run continues from this seeded state.
np.random.seed(42)

# 5x4 matrix of standard-normal draws, labelled with rows A-E and columns W-Z.
data = np.random.randn(5, 4)
df = pd.DataFrame(data, index=['A', 'B', 'C', 'D', 'E'], columns=['W', 'X', 'Y', 'Z'])
print(df)

          W         X         Y         Z
A -1.666933 -0.486213  0.715462  1.548926
B -0.001888 -0.217308  0.035424  0.018969
C  0.482270  0.363567 -0.686211  0.150149
D  0.310280 -1.778497 -0.054540 -1.324711
E -0.252969 -0.467639  2.396828 -1.981178


In [None]:
# Square brackets with a single label return that column as a Series
# (attribute access, df.W, gives the same column).
w_column = df['W']
print(w_column)

# Square brackets with a list of labels return a DataFrame holding just
# those columns.
wz_columns = df[['W', 'Z']]
print(wz_columns)


A   -1.666933
B   -0.001888
C    0.482270
D    0.310280
E   -0.252969
Name: W, dtype: float64
          W         Z
A -1.666933  1.548926
B -0.001888  0.018969
C  0.482270  0.150149
D  0.310280 -1.324711
E -0.252969 -1.981178


In [None]:
# Draw five standard-normal values, then attach them under a new label;
# assigning to a label that is not yet a column adds it as the last column.
new_column = np.random.randn(5)
df['New'] = new_column
print(df)


          W         X         Y         Z       New
A -1.666933 -0.486213  0.715462  1.548926 -0.341464
B -0.001888 -0.217308  0.035424  0.018969  0.179963
C  0.482270  0.363567 -0.686211  0.150149 -0.029747
D  0.310280 -1.778497 -0.054540 -1.324711 -1.190185
E -0.252969 -0.467639  2.396828 -1.981178 -1.384579


In [None]:
# drop() returns a new frame; the result is bound back to `df` rather than
# using inplace=True, so the frame is never mutated behind the reader's back
# and the calls can be chained.
df = (
    df
    .drop(index='A')      # Dropping rows: index= is the keyword form of axis=0
    .drop(columns='New')  # Dropping columns: columns= is the keyword form of axis=1
)

print(df)


          W         X         Y         Z
B -0.001888 -0.217308  0.035424  0.018969
C  0.482270  0.363567 -0.686211  0.150149
D  0.310280 -1.778497 -0.054540 -1.324711
E -0.252969 -0.467639  2.396828 -1.981178


In [None]:
# .loc selects by label: rows 'B' and 'C', columns 'W' and 'Y'.
by_label = df.loc[['B', 'C'], ['W', 'Y']]
print(by_label)

# .iloc selects by integer position: rows at positions 1 and 2,
# columns at positions 0 and 2.
by_position = df.iloc[[1, 2], [0, 2]]
print(by_position)


          W         Y
B -0.001888  0.035424
C  0.482270 -0.686211
         W         Y
C  0.48227 -0.686211
D  0.31028 -0.054540


In [None]:
# reset_index() moves the current row labels into an ordinary column (named
# 'index' here) and numbers the rows from 0. It returns a new frame.
flat = df.reset_index()
print(flat)

# set_index('X') uses the values of column 'X' as the row labels; it also
# returns a new frame rather than modifying df.
indexed_by_x = df.set_index('X')
print(indexed_by_x)


  index         W         X         Y         Z
0     B -0.001888 -0.217308  0.035424  0.018969
1     C  0.482270  0.363567 -0.686211  0.150149
2     D  0.310280 -1.778497 -0.054540 -1.324711
3     E -0.252969 -0.467639  2.396828 -1.981178
                  W         Y         Z
X                                      
-0.217308 -0.001888  0.035424  0.018969
 0.363567  0.482270 -0.686211  0.150149
-1.778497  0.310280 -0.054540 -1.324711
-0.467639 -0.252969  2.396828 -1.981178


In [None]:
# Build a DataFrame whose rows carry a two-level (hierarchical) index:
# the outer level is the group label, the inner level the item number.
hier_index = pd.MultiIndex.from_tuples([
    ('G1', 1), ('G1', 2),
    ('G2', 1), ('G2', 2),
    ('G3', 1), ('G3', 2),
])

# Six rows (one per index tuple) by four columns of standard-normal draws.
df = pd.DataFrame(np.random.randn(6, 4), index=hier_index, columns=list('ABCD'))

print(df)


             A         B         C         D
G1 1  0.861947  1.011935 -0.137301 -0.265021
   2 -0.300259  0.158694 -0.352546  0.492953
G2 1  0.564322  0.509015 -2.202127  0.348509
   2  1.077482 -1.084256  0.958016 -0.028033
G3 1 -0.367900  0.868599  1.164602  0.167400
   2  0.423387  1.284925 -1.314393 -1.739271


In [None]:
# NOTE(review): the df built from np.random.randn in the preceding cell
# appears to contain no missing values, so both results printed here are
# identical to df itself.

# dropna() returns a copy without any row that contains a null.
without_nulls = df.dropna()
print(without_nulls)

# df.mean() yields one mean per column; fillna() then replaces each null
# with the mean of the column it sits in.
column_means = df.mean()
filled = df.fillna(column_means)
print(filled)


             A         B         C         D
G1 1  0.861947  1.011935 -0.137301 -0.265021
   2 -0.300259  0.158694 -0.352546  0.492953
G2 1  0.564322  0.509015 -2.202127  0.348509
   2  1.077482 -1.084256  0.958016 -0.028033
G3 1 -0.367900  0.868599  1.164602  0.167400
   2  0.423387  1.284925 -1.314393 -1.739271
             A         B         C         D
G1 1  0.861947  1.011935 -0.137301 -0.265021
   2 -0.300259  0.158694 -0.352546  0.492953
G2 1  0.564322  0.509015 -2.202127  0.348509
   2  1.077482 -1.084256  0.958016 -0.028033
G3 1 -0.367900  0.868599  1.164602  0.167400
   2  0.423387  1.284925 -1.314393 -1.739271


In [None]:
# Grouping by 'Company' column and taking mean.
# The running `df` has no 'Company' column (grouping it raised KeyError), so
# the grouping is demonstrated on a small frame that actually has one.
sales = pd.DataFrame({
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Sales': [200, 120, 340, 124, 243, 350],
})
# One row per company, holding the mean of each remaining (numeric) column.
company_means = sales.groupby('Company').mean()
print(company_means)

# Concatenating two DataFrames
df1 = pd.DataFrame({'A': ['A1', 'A2', 'A3'], 'B': ['B1', 'B2', 'B3']})
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6'], 'B': ['B4', 'B5', 'B6']})
# Rows of df2 are stacked under those of df1; each frame keeps its own row
# labels, so the combined index is 0, 1, 2, 0, 1, 2.
combined = pd.concat([df1, df2])
print(combined)


KeyError: 'Company'

In [None]:
# The operations below need real column names; the placeholder labels
# ('Column', 'Values', 'Index', 'Columns') do not exist in the running `df`
# and raise KeyError. A small, self-contained example frame is used instead.
orders = pd.DataFrame({
    'Region': ['North', 'North', 'South', 'South', 'East', 'East'],
    'Quarter': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2'],
    'Units': [4, 9, 16, 25, 4, 36],
})

# Finding unique values in a column (returned in order of first appearance)
unique_units = orders['Units'].unique()
print(unique_units)

# Finding number of unique values in a column
n_unique_units = orders['Units'].nunique()
print(n_unique_units)

# Applying a function to DataFrame values.
# Restricted to the numeric column: np.sqrt is not defined for the string columns.
unit_roots = orders[['Units']].apply(np.sqrt)
print(unit_roots)

# Sorting DataFrame by a column (ascending by default)
sorted_by_units = orders.sort_values('Units')
print(sorted_by_units)

# Checking for null values (boolean frame of the same shape)
null_mask = orders.isnull()
print(null_mask)

# Creating a pivot table: one row per Region, one column per Quarter,
# cells hold the mean of 'Units' (mean is pivot_table's default aggregation).
units_pivot = pd.pivot_table(orders, values='Units', index='Region', columns='Quarter')
print(units_pivot)

# Sampling from DataFrame; random_state fixes which rows are drawn so the
# cell prints the same sample on every run.
sampled = orders.sample(5, random_state=0)
print(sampled)
