In [1]:
import pandas as pd
import os

def files_to_dataframe(directory):
    """Load every readable CSV file in ``directory`` into one DataFrame.

    Each regular file directly inside ``directory`` (sub-directories are
    skipped, the scan is not recursive) is parsed with ``pd.read_csv``. A
    ``filename`` column holding the source file's name is added to its rows.
    Files that cannot be parsed are reported on stdout and skipped.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan.

    Returns
    -------
    pandas.DataFrame
        All successfully parsed rows, re-indexed 0..n-1. An empty DataFrame
        is returned when no file could be read.
    """
    all_data = []
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the combined frame is the same on every run and platform.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        try:
            df = pd.read_csv(filepath)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {e}")
        else:
            df['filename'] = filename
            all_data.append(df)
    if not all_data:
        # pd.concat raises ValueError on an empty list, so handle the
        # "nothing readable" case explicitly.
        return pd.DataFrame()
    return pd.concat(all_data, ignore_index=True)

In [2]:
# Load every crawl output file found in the local web_data folder into a
# single DataFrame and dump it to the cell output.
directory_path = 'C:\\programming_projects\\ASU\\web_crawl\\web_data\\'
df = files_to_dataframe(directory=directory_path)
print(df)

Error reading webpage_analysis_run_11120.csv: No columns to parse from file
Error reading webpage_analysis_run_1164.csv: No columns to parse from file
Error reading webpage_analysis_run_1166.csv: No columns to parse from file
Error reading webpage_analysis_run_1168.csv: No columns to parse from file
Error reading webpage_analysis_run_1170.csv: No columns to parse from file
Error reading webpage_analysis_run_1172.csv: No columns to parse from file
Error reading webpage_analysis_run_1174.csv: No columns to parse from file
Error reading webpage_analysis_run_1176.csv: No columns to parse from file
Error reading webpage_analysis_run_1178.csv: No columns to parse from file
Error reading webpage_analysis_run_1410.csv: No columns to parse from file
Error reading webpage_analysis_run_1432.csv: No columns to parse from file
Error reading webpage_analysis_run_1434.csv: No columns to parse from file
Error reading webpage_analysis_run_1436.csv: No columns to parse from file
Error reading webpage_an

In [3]:
import pandas as pd
import re

def clean_text(text):
  """Reduce ``text`` to ASCII letters, digits, ``. , ! ?`` and single spaces.

  Every other character (including hyphens, quotes and all non-ASCII
  characters) is replaced by a space, runs of whitespace are collapsed to one
  space, and leading/trailing whitespace is stripped.

  Args:
    text: Value to clean. Non-string values are converted with ``str()``.
      Missing values (``None`` or a float NaN) yield an empty string.

  Returns:
    The cleaned string.
  """
  # Without this guard a missing cell would be stringified and the literal
  # words "None" / "nan" would leak into the cleaned text.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = re.sub(r'[^a-zA-Z0-9.,!?\s]', ' ', str(text))
  text = re.sub(r'\s+', ' ', text).strip()
  return text


# Build a cleaned copy of the page body. clean_text already collapses every
# run of whitespace (\r, \n, \t, \f, \v, spaces) into a single space and strips
# both ends, so no separate whitespace pass is needed afterwards.
df['cleaned_text'] = df['page_text'].apply(clean_text)

# title and topic are cleaned in place: the raw values are overwritten.
df['title'] = df['title'].apply(clean_text)
df['topic'] = df['topic'].apply(clean_text)
print(df.shape)

(37908, 9)


In [4]:
# Show the first rows to spot-check the cleaned columns against the raw ones.
df.head()

Unnamed: 0,url,depth,title,topic,word_count,char_count,page_text,filename,cleaned_text
0,https://admission.asu.edu/undergrad/nondegree,1,Nondegree student Admission ASU,Admissions Application Process,92.0,552.0,Nondegree requirementsTo be eligible to take c...,webpage_analysis_run_10.csv,Nondegree requirementsTo be eligible to take c...
1,https://asuonline.asu.edu/admission/first-year/,1,First Year Students ASU Online,ASU s Global Online Education,53.0,329.0,ASU Online empowers first-year students to tak...,webpage_analysis_run_10.csv,ASU Online empowers first year students to tak...
2,https://admission.asu.edu/apply/graduate/admis...,2,Graduate student admission Admission ASU,Admissions Application Process,264.0,1769.0,"In the graduate admission video series below, ...",webpage_analysis_run_100.csv,"In the graduate admission video series below, ..."
3,https://admission.asu.edu/cost-aid/graduate,2,Graduate student tuition Admission ASU,Housing Campus Life,58.0,404.0,Graduate student2025–2026 expensesAs a graduat...,webpage_analysis_run_100.csv,Graduate student2025 2026 expensesAs a graduat...
4,https://eoss.asu.edu/cora,3,Council of Religious Advisors Educational Outr...,Student Life Campus Activities,369.0,2525.0,"Celebrate faith, respect diversity, honor spir...",webpage_analysis_run_1000.csv,"Celebrate faith, respect diversity, honor spir..."


In [5]:
from langdetect import detect, DetectorFactory
# langdetect's algorithm is non-deterministic by default; fixing the seed
# before the first detect() call makes results repeatable across runs.
DetectorFactory.seed = 0

def lang_detect(text):
  """Return the language code that langdetect reports for ``text``.

  The value is converted with ``str()`` and every character other than ASCII
  letters, digits, ``. , ! ?`` and whitespace is dropped before detection.
  Any exception raised by ``detect`` propagates to the caller.
  """
  filtered = re.sub(r'[^a-zA-Z0-9.,!?\s]', '', str(text))
  return detect(filtered)

# Detect the language of every cleaned page. Rows for which detection fails
# (e.g. text that is empty or only punctuation/digits) keep an empty string
# and are printed as "<row position> - <text>" for inspection.
# NOTE: the column name 'langauage' (sic) is kept because later cells filter
# on exactly this name.
detected_languages = []
for i, text in enumerate(df['cleaned_text']):
    try:
        detected_languages.append(lang_detect(text))
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt can still stop this
        # long-running loop.
        print(i, '-', text)
        detected_languages.append('')
# Assign once, by position: correct for any index and avoids one slow
# .loc write per row.
df['langauage'] = detected_languages

19505 - 
28499 - .
32066 - 
32081 - 
32089 - 
32111 - 
36358 - , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,
36363 - , . , , . 2022 , 40 . , , . , , . , . , , , , , . , . . . , , . , . , . , . 40 . .
36366 - , 100 , , , 2022 , 40 , , , , , , , , , , , 100 , , , , , , 40
36379 - , 100 , , , 2022 , 40 , , , , , , , , , , , , . 100 , , , , , , 40
36396 - 100 2022 40 100 40


In [6]:
# Recompute word_count from the cleaned text: number of whitespace-separated
# tokens in each row (this overwrites the crawler-supplied column).
df['word_count'] = df['cleaned_text'].map(lambda cleaned: len(str(cleaned).split()))

In [7]:
# Narrow the frame step by step, printing the shape after each stage so the
# number of rows removed by every filter is visible.
print(df.shape)

# 1) Keep only pages with more than 50 words of cleaned text.
long_enough = df['word_count'] > 50
df = df[long_enough]
print(df.shape)

# 2) Keep only pages detected as English.
is_english = df['langauage'] == 'en'
df = df[is_english]
print(df.shape)

# 3) Drop rows that agree on all of the content columns below
#    (url, depth and filename are deliberately not part of the key).
dedup_key = ['title', 'topic', 'word_count', 'char_count', 'cleaned_text', 'langauage']
df_clean = df.drop_duplicates(subset=dedup_key)
print(df_clean.shape)

# Renumber rows 0..n-1 after filtering.
df_clean = df_clean.reset_index(drop=True)

(37908, 10)
(21843, 10)
(20956, 10)
(16343, 10)


In [8]:
# Persist the filtered, de-duplicated frame to the working directory;
# index=False keeps the row index out of the file.
df_clean.to_csv('cleaned_ASU_webpage.csv', index=False)

In [9]:
# Show the first rows of the cleaned frame.
df_clean.head()

Unnamed: 0,url,depth,title,topic,word_count,char_count,page_text,filename,cleaned_text,langauage
0,https://admission.asu.edu/undergrad/nondegree,1,Nondegree student Admission ASU,Admissions Application Process,93,552.0,Nondegree requirementsTo be eligible to take c...,webpage_analysis_run_10.csv,Nondegree requirementsTo be eligible to take c...,en
1,https://asuonline.asu.edu/admission/first-year/,1,First Year Students ASU Online,ASU s Global Online Education,55,329.0,ASU Online empowers first-year students to tak...,webpage_analysis_run_10.csv,ASU Online empowers first year students to tak...,en
2,https://admission.asu.edu/apply/graduate/admis...,2,Graduate student admission Admission ASU,Admissions Application Process,268,1769.0,"In the graduate admission video series below, ...",webpage_analysis_run_100.csv,"In the graduate admission video series below, ...",en
3,https://admission.asu.edu/cost-aid/graduate,2,Graduate student tuition Admission ASU,Housing Campus Life,62,404.0,Graduate student2025–2026 expensesAs a graduat...,webpage_analysis_run_100.csv,Graduate student2025 2026 expensesAs a graduat...,en
4,https://eoss.asu.edu/cora,3,Council of Religious Advisors Educational Outr...,Student Life Campus Activities,373,2525.0,"Celebrate faith, respect diversity, honor spir...",webpage_analysis_run_1000.csv,"Celebrate faith, respect diversity, honor spir...",en


In [10]:
# Show the last rows of the cleaned frame.
df_clean.tail()

Unnamed: 0,url,depth,title,topic,word_count,char_count,page_text,filename,cleaned_text,langauage
16338,https://students.asu.edu/diploma,4,Diploma ASU Students ASU,,600,,Diplomas\n\nDiplomas are mailed approximately ...,webpage_analysis_run_9980.csv,Diplomas Diplomas are mailed approximately six...,en
16339,http://graduation.asu.edu/ceremonies/additiona...,4,Additional Celebrations Graduation,,581,,Arizona State University's commitment to inclu...,webpage_analysis_run_9980.csv,Arizona State University s commitment to inclu...,en
16340,http://graduation.asu.edu/ceremonies/inclement...,4,Inclement Weather Plan Mountain America Stadiu...,,210,,If University Undergraduate Commencement or ot...,webpage_analysis_run_9980.csv,If University Undergraduate Commencement or ot...,en
16341,http://graduation.asu.edu/ceremonies/memories,4,"Regalia, photos, t shirts and flowers Graduation",,308,,"Caps, Gowns and Graduation Announcements\n\n\n...",webpage_analysis_run_9980.csv,"Caps, Gowns and Graduation Announcements Your ...",en
16342,http://graduation.asu.edu/ceremonies/futuredates,4,Future dates and Where to watch Graduation,,106,,Future dates\n\n\n\nThe University Academic Ca...,webpage_analysis_run_9980.csv,Future dates The University Academic Calendar ...,en
