In [1]:
import numpy as np
import pandas as pd

# Series
you can convert list, numpy, or dictionary to a Series

In [7]:
#create Series
ser1 = pd.Series([1,2,3,4],index = ['USA', 'Germany','USSR', 'Japan'])      

In [8]:
#Show first three rows
ser1.head(3)

USA        1
Germany    2
USSR       3
dtype: int64

In [9]:
ser1['USA']

1

# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [10]:
X=np.random.randn(5,4)
X

array([[-2.26226282,  0.39404687, -1.19109455, -0.54236684],
       [ 1.10391457, -0.33810318, -0.35988977, -0.10828821],
       [ 1.61762682,  0.68277926, -0.81792198,  0.99912081],
       [ 1.75941759, -2.04456978,  0.06537166, -0.2345489 ],
       [-0.1889991 , -0.52164626,  1.65940145,  0.95557213]])

In [11]:
index=['A','B','C','D','E']
index

['A', 'B', 'C', 'D', 'E']

In [12]:
columns=['W','X','Y','Z']
columns

['W', 'X', 'Y', 'Z']

In [14]:
#create DataFrame
df = pd.DataFrame(data=X,index=index,columns=columns)

df.head()

Unnamed: 0,W,X,Y,Z
A,-2.262263,0.394047,-1.191095,-0.542367
B,1.103915,-0.338103,-0.35989,-0.108288
C,1.617627,0.682779,-0.817922,0.999121
D,1.759418,-2.04457,0.065372,-0.234549
E,-0.188999,-0.521646,1.659401,0.955572


# Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [15]:
df['W']

A   -2.262263
B    1.103915
C    1.617627
D    1.759418
E   -0.188999
Name: W, dtype: float64

In [16]:
# Pass a list of column names
df[['W','Z']]

Unnamed: 0,W,Z
A,-2.262263,-0.542367
B,1.103915,-0.108288
C,1.617627,0.999121
D,1.759418,-0.234549
E,-0.188999,0.955572


### Creating a new column:

In [25]:
df['new'] = df['W'] + df['Y']

df.head()

Unnamed: 0,W,X,Y,Z,new
A,-2.262263,0.394047,-1.191095,-0.542367,-3.453357
B,1.103915,-0.338103,-0.35989,-0.108288,0.744025
C,1.617627,0.682779,-0.817922,0.999121,0.799705
D,1.759418,-2.04457,0.065372,-0.234549,1.824789


### Removing Columns

In [26]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,-2.262263,0.394047,-1.191095,-0.542367
B,1.103915,-0.338103,-0.35989,-0.108288
C,1.617627,0.682779,-0.817922,0.999121
D,1.759418,-2.04457,0.065372,-0.234549


In [27]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,new
A,-2.262263,0.394047,-1.191095,-0.542367,-3.453357
B,1.103915,-0.338103,-0.35989,-0.108288,0.744025
C,1.617627,0.682779,-0.817922,0.999121,0.799705
D,1.759418,-2.04457,0.065372,-0.234549,1.824789


In [28]:
df.drop('new',axis=1,inplace=True)

In [29]:
df

Unnamed: 0,W,X,Y,Z
A,-2.262263,0.394047,-1.191095,-0.542367
B,1.103915,-0.338103,-0.35989,-0.108288
C,1.617627,0.682779,-0.817922,0.999121
D,1.759418,-2.04457,0.065372,-0.234549


Can also drop rows this way:

In [33]:
df.drop('E',axis=0)

KeyError: "['E'] not found in axis"

### Selecting Rows

In [16]:
#Row of A
df.loc['A']

W   -0.166527
X   -0.728846
Y    0.593888
Z    0.908652
Name: A, dtype: float64

##### Or select based off of position instead of label 

In [17]:
#Row of index two (الصف الثالث)
df.iloc[2]

W    0.222940
X   -1.837276
Y    0.781836
Z    1.070257
Name: C, dtype: float64

In [18]:
df.iloc[:2,:1]

Unnamed: 0,W
A,-0.166527
B,-1.901168


### Selecting subset of rows and columns

In [19]:
df.loc['B','Y']

0.33463185901936837

In [20]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-0.166527,0.593888
B,-1.901168,0.334632


# Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [21]:
df

Unnamed: 0,W,X,Y,Z
A,-0.166527,-0.728846,0.593888,0.908652
B,-1.901168,-0.767672,0.334632,-1.482718
C,0.22294,-1.837276,0.781836,1.070257
D,-0.554058,-0.934997,0.723236,-0.367784
E,-0.663096,-0.127525,-0.451881,0.547473


In [22]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,False,True,True
B,False,False,True,False
C,True,False,True,True
D,False,False,True,False
E,False,False,False,True


In [23]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,,0.593888,0.908652
B,,,0.334632,
C,0.22294,,0.781836,1.070257
D,,,0.723236,
E,,,,0.547473


In [24]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
C,0.22294,-1.837276,0.781836,1.070257


In [25]:
df[df['W']>0]['Y']

C    0.781836
Name: Y, dtype: float64

### For two conditions you can use | and & with parenthesis:

In [26]:
df[(df['W']>0) & (df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z
C,0.22294,-1.837276,0.781836,1.070257


# More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!

In [27]:
newind = ['CA', 'NY', 'WY', 'OR', 'CO']
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [28]:
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,-0.166527,-0.728846,0.593888,0.908652,CA
B,-1.901168,-0.767672,0.334632,-1.482718,NY
C,0.22294,-1.837276,0.781836,1.070257,WY
D,-0.554058,-0.934997,0.723236,-0.367784,OR
E,-0.663096,-0.127525,-0.451881,0.547473,CO


In [29]:
#Set index
df.set_index('States',inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.166527,-0.728846,0.593888,0.908652
NY,-1.901168,-0.767672,0.334632,-1.482718
WY,0.22294,-1.837276,0.781836,1.070257
OR,-0.554058,-0.934997,0.723236,-0.367784
CO,-0.663096,-0.127525,-0.451881,0.547473


# Missing Data

Let's show a few convenient methods to deal with Missing Data in pandas:

In [30]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})

df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [31]:
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [32]:
df.isna().sum()

A    1
B    2
C    0
dtype: int64

In [33]:
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [34]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [35]:
df.dropna() #inplace=True

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [40]:
df.reset_index(drop=True,inplace=True)

In [41]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [36]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [37]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [38]:
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [39]:
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# duplicated 

Let's show a few convenient methods to deal with duplicated Data in pandas:

In [41]:
df = pd.DataFrame({'A':[1,1,15,1],
                  'B':[2,2,11,2],
                  'C':[3,3,7,2],
                  'C':[3,3,7,3]})

df

Unnamed: 0,A,B,C
0,1,2,3
1,1,2,3
2,15,11,7
3,1,2,3


In [42]:
df.duplicated()

0    False
1     True
2    False
3     True
dtype: bool

In [43]:
df.duplicated().sum()

2

In [44]:
df.drop_duplicates()

Unnamed: 0,A,B,C
0,1,2,3
2,15,11,7


# Operations

There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show them here in this lecture:

In [12]:
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4],
                   'col2':[444,555,666,444],
                   'col3':['abc','def','ghi','xyz']})
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### Info on Unique Values

In [14]:
df['col2'].unique()

array([444, 555, 666], dtype=int64)

In [15]:
df['col2'].nunique()

3

In [16]:
df['col2'].value_counts()

col2
444    2
555    1
666    1
Name: count, dtype: int64

In [17]:
df['col2'].value_counts().sort_values()

col2
555    1
666    1
444    2
Name: count, dtype: int64

# Applying Functions

In [18]:
def times2(x):
    return x*2

In [19]:
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [20]:
df['col1'].apply(lambda x:x*2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

### Get column and index names:

In [53]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [54]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      int64 
 1   col2    4 non-null      int64 
 2   col3    4 non-null      object
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes


In [56]:
df.describe()

Unnamed: 0,col1,col2
count,4.0,4.0
mean,2.5,527.25
std,1.290994,106.274409
min,1.0,444.0
25%,1.75,444.0
50%,2.5,499.5
75%,3.25,582.75
max,4.0,666.0


### Groupby

The groupby method allows you to group rows of data together and call aggregate functions

In [25]:
import pandas as pd

# Create the DataFrame
data = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
        'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
        'Sales': [200, 120, 340, 124, 243, 350]}

df = pd.DataFrame(data)

df
df[df['Company'] == 'GOOG']

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


#### Example 1: Group by Company and Calculate Sum of Sales
You can use the groupby() function along with an aggregation function like sum() to group the data by company and calculate the total sales for each company.

In [28]:
# Group by 'Company' and sum the sales
company_sales_sum = df.groupby('Company').sum()
print(company_sales_sum)


             Person  Sales
Company                   
FB        CarlSarah    593
GOOG     SamCharlie    320
MSFT     AmyVanessa    464


#### Example 2: Group by Company and Calculate Mean of Sales
You can also calculate the average sales for each company.

In [54]:
# Group by 'Company' and calculate mean sales
company_sales_mean = df.groupby('Company')['Sales'].mean()
print(company_sales_mean)


Company
FB      296.5
GOOG    160.0
MSFT    232.0
Name: Sales, dtype: float64


#### Example 3: Group by Company and Get Count of Entries
If you want to count how many entries (i.e., how many people) are there for each company, you can use the count() function.

In [47]:
# Group by 'Company' and count the number of entries
company_sales_count = df.groupby('Company').count()
print(company_sales_count)


         Person  Sales
Company               
FB            2      2
GOOG          2      2
MSFT          2      2


#### Example 4: Group by Company and Get Maximum Sales
You can also use the max() function to get the maximum sales for each company.

In [48]:
# Group by 'Company' and find the maximum sales value
company_sales_max = df.groupby('Company').max()
print(company_sales_max)


          Person  Sales
Company                
FB         Sarah    350
GOOG         Sam    200
MSFT     Vanessa    340


#### Example 5: Group by Company and Apply Multiple Aggregation Functions
You can apply multiple aggregation functions at once using the agg() function.

In [53]:
# Group by 'Company' and apply multiple aggregation functions only to the 'Sales' column
company_sales_agg = df.groupby('Company')['Sales'].agg(['sum', 'mean', 'max'])
print(company_sales_agg)


         sum   mean  max
Company                 
FB       593  296.5  350
GOOG     320  160.0  200
MSFT     464  232.0  340


In [56]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [57]:
df.groupby('Company').describe().transpose()

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [58]:
df.groupby('Company').describe().transpose()['GOOG']

Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

#### Concatenation

Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use **pd.concat** and pass in a list of DataFrames to concatenate together:

In [1]:
import pandas as pd

In [2]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [3]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7]) 

In [4]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [5]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [6]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [7]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [8]:
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [9]:
w = pd.concat([df1,df2,df3],axis=1)

In [11]:
w

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [10]:
w['A']

Unnamed: 0,A,A.1,A.2
0,A0,,
1,A1,,
2,A2,,
3,A3,,
4,,A4,
5,,A5,
6,,A6,
7,,A7,
8,,,A8
9,,,A9


## Select data type

In [35]:
import pandas as pd

# Sample DataFrame with different data types
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [24, 27, 22, 32],
    'height': [5.5, 6.0, 5.8, 5.7],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
    'is_student': [True, False, True, False]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,age,height,city,is_student
0,Alice,24,5.5,New York,True
1,Bob,27,6.0,Los Angeles,False
2,Charlie,22,5.8,Chicago,True
3,David,32,5.7,Houston,False


In [36]:
# 1. Selecting only numeric columns
numeric_df = df.select_dtypes(include='number')
print("Numeric columns:\n", numeric_df)


Numeric columns:
    age  height
0   24     5.5
1   27     6.0
2   22     5.8
3   32     5.7


In [37]:
# 2. Selecting only object (string) columns
object_df = df.select_dtypes(include='object')
print("\nObject columns:\n", object_df)


Object columns:
       name         city
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston


In [38]:
# 3. Excluding boolean columns
no_bool_df = df.select_dtypes(exclude='bool')
print("\nDataFrame without boolean columns:\n", no_bool_df)


DataFrame without boolean columns:
       name  age  height         city
0    Alice   24     5.5     New York
1      Bob   27     6.0  Los Angeles
2  Charlie   22     5.8      Chicago
3    David   32     5.7      Houston


In [39]:
# 4. Selecting only numeric columns
numeric_object_df = df.select_dtypes(include=['number','object'])
print("Numeric object columns:\n", numeric_object_df)


Numeric object columns:
       name  age  height         city
0    Alice   24     5.5     New York
1      Bob   27     6.0  Los Angeles
2  Charlie   22     5.8      Chicago
3    David   32     5.7      Houston


## Change type column

In [1]:
# Sample DataFrame
data = {'age': [24.0, 27.0, 22.0, 32.0]}
df = pd.DataFrame(data)

# Convert 'age' column to integer type
df['age'] = df['age'].astype(int)
print(df.dtypes)


age    int32
dtype: object


In [2]:
df['age'] = df['age'].astype(str)
print(df.dtypes)


age    object
dtype: object


In [3]:
df['age'] = df['age'].astype(bool)
print(df.dtypes)


age    bool
dtype: object


## Date time 

In [24]:
# Create a DataFrame with a date column in string format
data = {
    'Event': ['Event1', 'Event2', 'Event3'],
    'Date': ['2024-11-01', '2024-11-02', '2024-11-03'] , # Dates as strings
    'Date1': ['2024/11/01', '2024/11/02', '2024/11/03']  # Dates as strings
}
df = pd.DataFrame(data)

df


Unnamed: 0,Event,Date,Date1
0,Event1,2024-11-01,2024/11/01
1,Event2,2024-11-02,2024/11/02
2,Event3,2024-11-03,2024/11/03


In [25]:
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
df['Date1'] = pd.to_datetime(df['Date1'])
df

Unnamed: 0,Event,Date,Date1
0,Event1,2024-11-01,2024-11-01
1,Event2,2024-11-02,2024-11-02
2,Event3,2024-11-03,2024-11-03


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Event   3 non-null      object        
 1   Date    3 non-null      datetime64[ns]
 2   Date1   3 non-null      datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 204.0+ bytes


In [28]:
# Extract day, month, and year into separate columns
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

In [29]:
# Extract day, month, and year into separate columns
df['Day1'] = df['Date1'].dt.day
df['Month1'] = df['Date1'].dt.month
df['Year1'] = df['Date1'].dt.year

In [30]:
df

Unnamed: 0,Event,Date,Date1,Day1,Month1,Year1,Day,Month,Year
0,Event1,2024-11-01,2024-11-01,1,11,2024,1,11,2024
1,Event2,2024-11-02,2024-11-02,2,11,2024,2,11,2024
2,Event3,2024-11-03,2024-11-03,3,11,2024,3,11,2024


##  Replace Function

In [6]:
# Sample DataFrame
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)

# Replace 'New York' with 'NYC' in the 'city' column
df['city'] = df['city'].replace('New York', 'NYC')
print(df)


      name         city
0    Alice          NYC
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston


In [7]:
# Replace 'New York' with 'NYC' and 'Los Angeles' with 'LA'
df['city'] = df['city'].replace({'New York': 'NYC', 'Los Angeles': 'LA'})
print(df)


      name     city
0    Alice      NYC
1      Bob       LA
2  Charlie  Chicago
3    David  Houston


In [8]:
# Replace both 'New York' and 'Los Angeles' with 'Unknown'
df['city'] = df['city'].replace(['New York', 'Los Angeles'], 'Unknown')
print(df)


      name     city
0    Alice      NYC
1      Bob       LA
2  Charlie  Chicago
3    David  Houston


In [9]:
# Replace 'Alice' with 'Alicia' and 'David' with 'Dave' across the DataFrame
df = df.replace({'Alice': 'Alicia', 'David': 'Dave'})
print(df)


      name     city
0   Alicia      NYC
1      Bob       LA
2  Charlie  Chicago
3     Dave  Houston


In [10]:
# Replace any name starting with 'A' with 'Starts with A'
df['name'] = df['name'].replace(r'^A.*', 'Starts with A', regex=True)
print(df)


            name     city
0  Starts with A      NYC
1            Bob       LA
2        Charlie  Chicago
3           Dave  Houston


## Encoding 

In [14]:
# Sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Gender': ['Female', 'Male', 'Male'],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert categorical columns to dummy variables
df_dummies = pd.get_dummies(df, columns=['Gender', 'City'])

df_dummies


Unnamed: 0,Name,Gender_Female,Gender_Male,City_Chicago,City_Los Angeles,City_New York
0,Alice,True,False,False,False,True
1,Bob,False,True,False,True,False
2,Charlie,False,True,True,False,False


# Read and Save data

This notebook is the reference code for getting Read and Save, pandas can read a variety of file types using its pd.read_ methods. Let's take a look at the most common data types:

In [21]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,rank,state,state_code,2020_census,percent_of_total
0,1,California,CA,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


In [43]:
df = pd.read_csv('data.csv',na_values='CA')
df.head()

Unnamed: 0,rank,state,state_code,2020_census,percent_of_total
0,1,California,,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


In [59]:
df = pd.read_csv('data.csv',index_col=['rank'])
df.head()

Unnamed: 0_level_0,state,state_code,2020_census,percent_of_total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,California,CA,39538223.0,0.1191
2,Texas,TX,29145505.0,0.0874
3,Florida,FL,21538187.0,0.0647
4,,NY,,0.0586
5,Pennsylvania,PA,13002700.0,0.0386


In [29]:
df = pd.read_csv('data.csv',header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,rank,state,state_code,2020_census,percent_of_total
1,1,California,CA,39538223,0.1191
2,2,Texas,TX,29145505,0.0874
3,3,Florida,FL,21538187,0.0647
4,4,,NY,,0.0586


In [32]:
df = pd.read_csv('data.csv',header=0,names=['rank','St','St_co','2020_cen','Per_o_to'])
df.head()

Unnamed: 0,rank,St,St_co,2020_cen,Per_o_to
0,1,California,CA,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


# Saving Csv

In [23]:
df = pd.read_csv('data.csv')
df.to_csv('output.csv',index=False)