In [1]:
import pandas as pd
import re

# Load dataset
file_path = "Sandalwood_Scraping1.csv"  # Change the file path if needed
df = pd.read_csv(file_path)

# Extract dates from "Date of Birth" column
extracted_dates = df["Date of Birth"].str.extractall(r'(\d{4}-\d{2}-\d{2})')[0]

# Fill "DOB" column with the first extracted date
df["DOB"] = extracted_dates.groupby(level=0).first()

# Extract remaining text as "Additional Info"
df["Additional Info"] = df["Date of Birth"].apply(lambda x: re.sub(r'\d{4}-\d{2}-\d{2}', '', str(x)).strip())

# Clean "Additional Info" column: remove unwanted characters (like reference numbers [1][2])
df["Additional Info"] = df["Additional Info"].apply(lambda x: re.sub(r'\[\d+\]', '', x).strip() if isinstance(x, str) else x)

# Extract names (assuming other names are capitalized words) and ages from "Additional Info"
df["Other Name"] = df["Additional Info"].str.extract(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)')
df["Age"] = df["Additional Info"].str.extract(r'(\d+)')

# Define a list of month names to filter out from "Other Name"
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
df["Other Name"] = df["Other Name"].apply(lambda x: "Unknown" if isinstance(x, str) and any(month in x for month in months) else x)

# Drop "Additional Info" column if it is empty after extraction
df.drop(columns=["Additional Info"], inplace=True)

# Drop the original "Date of Birth" column
df.drop(columns=["Date of Birth"], inplace=True)

# Reorder columns to place "Other Name" and "DOB" after "Name"
columns_order = ["Name", "Other Name", "DOB"] + [col for col in df.columns if col not in ["Name", "Other Name", "DOB"]]
df = df[columns_order]

# Clean "Spouse" column: remove extra characters
if "Spouse" in df.columns:
    df["Spouse"] = df["Spouse"].str.replace(r'\s*\(.*?\)\s*', '', regex=True).str.strip()

# Clean "Children" column: ensure numerical values or NaN
df["Children"] = pd.to_numeric(df["Children"], errors='coerce')

# Remove unwanted characters (like reference numbers [1][2]) from all columns
def clean_text(text):
    if isinstance(text, str):
        return re.sub(r'\[\d+\]', '', text).strip()
    return text
df = df.applymap(clean_text)

# Fill missing values where possible
df.fillna("Unknown", inplace=True)

# Save cleaned data to a new CSV file
df.to_csv("Sandalwood_Cleaned.csv", index=False)

print("Data cleaning complete. Cleaned file saved as 'Sandalwood_Cleaned.csv'.")


Data cleaning complete. Cleaned file saved as 'Sandalwood_Cleaned.csv'.


  df = df.applymap(clean_text)
  df.fillna("Unknown", inplace=True)


In [2]:
df

Unnamed: 0,Name,Other Name,DOB,Spouse,Awards,Constituency,Political Party,Children,Relatives,Occupations,Website,Notable Works,Parents,Honours,Age
0,Priyamani,Priya Vasudevan Mani Iyer,1984-06-04,Mustafa Raj ​​,Full list,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4
1,Ramya,Divya Spandana,1982-11-29,Unknown,Unknown,Mandya,Indian National Congress,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,29
2,Radhika Pandit,Unknown,1984-03-07,Yash ​​,Unknown,Unknown,Unknown,2.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,7
3,Rashmika Mandanna,Unknown,1996-04-05,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,5
4,Rachita Ram,Bindhiya Ram,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Nithya Ram (sister),Unknown,Unknown,Unknown,Unknown,Unknown,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Kristina Akheeva,Unknown,1986-11-01,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,www .kristina-akheeva .com,Unknown,Unknown,Unknown,1
167,Kriti Kharbanda,Unknown,1990-10-29,Pulkit Samrat ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,29
168,Kruttikaa,Kruttika,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,https://kruttikaravindra.com/,Unknown,Unknown,Unknown,Unknown
169,Kumari Padmini,Padmini Chennai,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [3]:
print(df.dtypes)

Name               object
Other Name         object
DOB                object
Spouse             object
Awards             object
Constituency       object
Political Party    object
Children           object
Relatives          object
Occupations        object
Website            object
Notable Works      object
Parents            object
Honours            object
Age                object
dtype: object


In [5]:
pd.set_option('display.max_rows', None)  # Set to None to display all rows                                               
pd.set_option('display.max_columns', None) # Set to None to display all columns

In [6]:
df

Unnamed: 0,Name,Other Name,DOB,Spouse,Awards,Constituency,Political Party,Children,Relatives,Occupations,Website,Notable Works,Parents,Honours,Age
0,Priyamani,Priya Vasudevan Mani Iyer,1984-06-04,Mustafa Raj ​​,Full list,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4
1,Ramya,Divya Spandana,1982-11-29,Unknown,Unknown,Mandya,Indian National Congress,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,29
2,Radhika Pandit,Unknown,1984-03-07,Yash ​​,Unknown,Unknown,Unknown,2.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,7
3,Rashmika Mandanna,Unknown,1996-04-05,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,5
4,Rachita Ram,Bindhiya Ram,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Nithya Ram (sister),Unknown,Unknown,Unknown,Unknown,Unknown,3
5,Sruthi Hariharan,Unknown,1989-02-02,Raam Kumar ​​,Unknown,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2
6,Parul Yadav,Mumbai,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Actress producer,www .parulyadav .com,Unknown,Unknown,Unknown,1
7,Aindrita Ray,Unknown,Unknown,Diganth ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Actress model,Unknown,Unknown,Unknown,Unknown,19
8,Amulya,Moulya Bangalore,Unknown,Jagadish ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
9,Sapthami Gowda,Sapthami Gowda,1996-06-08,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,8


In [7]:
print(df['Name'])

0                    Priyamani
1                        Ramya
2               Radhika Pandit
3            Rashmika Mandanna
4                  Rachita Ram
5             Sruthi Hariharan
6                  Parul Yadav
7                 Aindrita Ray
8                       Amulya
9               Sapthami Gowda
10             Srinidhi Shetty
11            Ashika Ranganath
12            Aditi Prabhudeva
13              Nishvika Naidu
14               Sanjana Anand
15          Sangeetha Sringeri
16             Amrutha Iyengar
17                   Sreeleela
18              Milana Nagaraj
19               Shubha Poonja
20            Sharmiela Mandre
21               Pallavi Gowda
22                      Aamani
23                     Aarathi
24                   Anu Mehta
25                   Asha Bhat
26                 Amy Jackson
27                      Ambika
28                Archana Jois
29               Anu Prabhakar
30                Nagabhushana
31              Pruthvi Ambaar
32      

In [8]:
df_new = pd.read_csv("Sandalwood_Cleaned10feb.csv")

In [9]:
df_new

Unnamed: 0,Name,Other Name,DOB,Occupations,Spouse,Children,Relatives,Notable Works,Website,Political Party,Constituency,Awards,Parents,Age
0,V. Ravichandran,Veeraswamy Ravichandran,1961-05-30,Actor filmmaker composer Lyricist Television P...,Sumathy ​​,Unknown,Balaji (brother) [ 3 ],Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,30
1,Prem,Prem Kumar,1975-04-18,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,18
2,Vijay Raghavendra,Bengaluru,Unknown,Actor film director Television presenter,Spandana Vijay ​ ​​,1.0,Sriimurali (brother) See Rajkumar family,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,Rajesh,Munichowdappa,1932-04-15,Unknown,Unknown,Unknown,Arjun Sarja (son-in-law),Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,15
4,Uma Shivakumar,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,1
5,Sharath Lohithaswa,Sharatchandra Lohithaswa,1972-05-05,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,1
6,P. Ravi Shankar,Pudipeddi Ravi Shankar,1966-11-28,Actor dubbing artist writer director,Suchil,1.0,Sai Kumar (brother) Aadi (nephew),Partial list,Unknown,Unknown,Unknown,Unknown,Unknown,28
7,Girija Lokesh,Girija,1951-01-10,Unknown,Lokesh,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,10
8,Satya Prakash,Satya Prakash,Unknown,Actor director,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,1
9,Master Anand,Anand,1984-01-04,Actor film director Television presenter,Yashaswini ​​,Unknown,Unknown,Unknown,Official website,Unknown,Unknown,Unknown,Unknown,4


In [12]:
combined_df = pd.concat([df,df_new],ignore_index=True)


In [13]:
combined_df

Unnamed: 0,Name,Other Name,DOB,Spouse,Awards,Constituency,Political Party,Children,Relatives,Occupations,Website,Notable Works,Parents,Honours,Age
0,Priyamani,Priya Vasudevan Mani Iyer,1984-06-04,Mustafa Raj ​​,Full list,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4
1,Ramya,Divya Spandana,1982-11-29,Unknown,Unknown,Mandya,Indian National Congress,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,29
2,Radhika Pandit,Unknown,1984-03-07,Yash ​​,Unknown,Unknown,Unknown,2.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,7
3,Rashmika Mandanna,Unknown,1996-04-05,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,5
4,Rachita Ram,Bindhiya Ram,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Nithya Ram (sister),Unknown,Unknown,Unknown,Unknown,Unknown,3
5,Sruthi Hariharan,Unknown,1989-02-02,Raam Kumar ​​,Unknown,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2
6,Parul Yadav,Mumbai,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Actress producer,www .parulyadav .com,Unknown,Unknown,Unknown,1
7,Aindrita Ray,Unknown,Unknown,Diganth ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Actress model,Unknown,Unknown,Unknown,Unknown,19
8,Amulya,Moulya Bangalore,Unknown,Jagadish ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
9,Sapthami Gowda,Sapthami Gowda,1996-06-08,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,8


In [14]:
print(combined_df['Name'])

0                           Priyamani
1                               Ramya
2                      Radhika Pandit
3                   Rashmika Mandanna
4                         Rachita Ram
5                    Sruthi Hariharan
6                         Parul Yadav
7                        Aindrita Ray
8                              Amulya
9                      Sapthami Gowda
10                    Srinidhi Shetty
11                   Ashika Ranganath
12                   Aditi Prabhudeva
13                     Nishvika Naidu
14                      Sanjana Anand
15                 Sangeetha Sringeri
16                    Amrutha Iyengar
17                          Sreeleela
18                     Milana Nagaraj
19                      Shubha Poonja
20                   Sharmiela Mandre
21                      Pallavi Gowda
22                             Aamani
23                            Aarathi
24                          Anu Mehta
25                          Asha Bhat
26          

In [16]:
duplicate_count = combined_df.duplicated().sum()
print(f"Number of duplicate rows:{duplicate_count}")

Number of duplicate rows:0


In [17]:
df1 = pd.read_csv("Sandalwood_Cleaned16.csv")

In [18]:
df1

Unnamed: 0,Name,Other Name,DOB,Notable Works,Spouse,Children,Honours,Occupations,Awards,Relatives,Constituency,Political Party,Parents,Age
0,B. Saroja Devi,Unknown,Unknown,Full list,Sri Harsha ​ ​​,2.0,Padma Bhushan (1992) Padma Shri (1969),Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,Suhasini Maniratnam,Unknown,Unknown,Unknown,Mani Ratnam ​​,1.0,Unknown,Actress director producer writer Dubbing Artist,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,Anant Nag,Unknown,Unknown,Full list,Gayatri ​​,Unknown,Unknown,Unknown,Padma Bhushan (2025) Filmfare Award Rajyotsava...,Unknown,Unknown,Unknown,Unknown,Unknown
3,Devaraj,Unknown,Unknown,Unknown,Chandralekha [ 1 ],Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,Malashri,Unknown,Unknown,Unknown,Ramu ​ ​​ [ 1 ],Unknown,Unknown,Unknown,Unknown,Subhashri (step-sister),Unknown,Unknown,Unknown,Unknown
5,Prema,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Neravanda Aiyappa (brother),Unknown,Unknown,Unknown,Unknown
6,Avinash Yelandur,Unknown,Unknown,Unknown,Malavika ​​,1.0,Unknown,Actor professor,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
7,Srinath,Unknown,Unknown,Unknown,Geetha ​​,2.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
8,Bharathi Vishnuvardhan,Unknown,Unknown,Unknown,Vishnuvardhan ​ ​​,2.0,Unknown,Unknown,Padma Shri (2017),Unknown,Unknown,Unknown,Unknown,Unknown
9,Umashree,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Terdal,Indian National Congress,Unknown,Unknown


In [19]:
NEW_df = pd.concat([combined_df,df1],ignore_index=True)

In [21]:
NEW_df

Unnamed: 0,Name,Other Name,DOB,Spouse,Awards,Constituency,Political Party,Children,Relatives,Occupations,Website,Notable Works,Parents,Honours,Age
0,Priyamani,Priya Vasudevan Mani Iyer,1984-06-04,Mustafa Raj ​​,Full list,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4
1,Ramya,Divya Spandana,1982-11-29,Unknown,Unknown,Mandya,Indian National Congress,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,29
2,Radhika Pandit,Unknown,1984-03-07,Yash ​​,Unknown,Unknown,Unknown,2.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,7
3,Rashmika Mandanna,Unknown,1996-04-05,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,5
4,Rachita Ram,Bindhiya Ram,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Nithya Ram (sister),Unknown,Unknown,Unknown,Unknown,Unknown,3
5,Sruthi Hariharan,Unknown,1989-02-02,Raam Kumar ​​,Unknown,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2
6,Parul Yadav,Mumbai,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Actress producer,www .parulyadav .com,Unknown,Unknown,Unknown,1
7,Aindrita Ray,Unknown,Unknown,Diganth ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Actress model,Unknown,Unknown,Unknown,Unknown,19
8,Amulya,Moulya Bangalore,Unknown,Jagadish ​​,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
9,Sapthami Gowda,Sapthami Gowda,1996-06-08,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,8


In [22]:
duplicate_count = NEW_df.duplicated().sum()
print(f"Number of duplicate rows:{duplicate_count}")

Number of duplicate rows:0


In [23]:
duplicates = NEW_df['Name'].duplicated()
print(duplicates)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50     False
51     False
52     False
53     False
54     False
55     False
56     False
57     False
58     False
59     False
60     False
61     False
62     False
63     False
64     False
65     False
66     False
67     False
68     False
69     False
70     False
71     False
72     False
73     False
74     False
75     False
76     False

In [24]:
# Save cleaned data to a new CSV file
NEW_df.to_csv("Sandalwood260.csv", index=False)