In [14]:
import os
import pandas as pd

In [28]:
news_dir = "BBC News Summary/BBC News Summary/News Articles"
summary_dir = "BBC News Summary/BBC News Summary/Summaries"

In [30]:
# Ensure the folders exist
if not os.path.exists(news_dir) or not os.path.exists(summary_dir):
    print("Error: 'news' or 'summary' folder not found.")
    exit()

# Iterate through each category
for category in os.listdir(news_dir):
    news_category_path = os.path.join(news_dir, category)
    summary_category_path = os.path.join(summary_dir, category)
    
    # Ensure the category exists in both folders
    if os.path.isdir(news_category_path) and os.path.isdir(summary_category_path):
        print(f"Processing category: {category}")
        
        # List to store the data for the current category
        data = []
        
        # Get all text files in the current category
        news_files = os.listdir(news_category_path)
        
        for news_file in news_files:
            # Full paths to the news and corresponding summary files
            news_file_path = os.path.join(news_category_path, news_file)
            summary_file_path = os.path.join(summary_category_path, news_file)
            
            # Check if both files exist
            if os.path.isfile(news_file_path) and os.path.isfile(summary_file_path):
                try:
                    # Read the news file content
                    with open(news_file_path, 'r', encoding='utf-8', errors='replace') as nf:
                        news_content = nf.read()
                    
                    # Read the summary file content
                    with open(summary_file_path, 'r', encoding='utf-8', errors='replace') as sf:
                        summary_content = sf.read()
                    
                    # Append the data to the list
                    data.append({
                        "News": news_content,
                        "Summary": summary_content
                    })
                except Exception as e:
                    print(f"Error reading files {news_file}: {e}")
            else:
                print(f"Skipping: {news_file} (matching file not found in 'summary')")
        
        # Save the data for the current category to a CSV file
        if data:
            df = pd.DataFrame(data)
            output_csv_path = f"{category}.csv"
            df.to_csv(output_csv_path, index=False, encoding='utf-8')
            print(f"CSV file created for category '{category}' at: {output_csv_path}")
        else:
            print(f"No data for category '{category}'. Skipping.")
    else:
        print(f"Skipping category: {category} (missing corresponding folder in 'summary')")

Processing category: business
CSV file created for category 'business' at: business.csv
Processing category: entertainment
CSV file created for category 'entertainment' at: entertainment.csv
Processing category: politics
CSV file created for category 'politics' at: politics.csv
Processing category: sport
Skipping: .ipynb_checkpoints (matching file not found in 'summary')
CSV file created for category 'sport' at: sport.csv
Processing category: tech
CSV file created for category 'tech' at: tech.csv


In [32]:
csv_files = ["business.csv", "entertainment.csv", "politics.csv", "sports.csv", "tech.csv"]

dataframes = []

for file in csv_files:
    try:
        df = pd.read_csv(file)
        dataframes.append(df)
    except FileNotFoundError:
        print(f"File not found: {file}")
    except Exception as e:
        print(f"Error reading {file}: {e}")

# Concatenate all DataFrames
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_csv_path = "newsSummary.csv"
merged_df.to_csv(merged_csv_path, index=False, encoding='utf-8')

print(f"All files merged into: {merged_csv_path}")

File not found: sports.csv
All files merged into: newsSummary.csv


In [34]:
data = pd.read_csv("newsSummary.csv")

In [36]:
data.head()

Unnamed: 0,News,Summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [38]:
data.sample(10)

Unnamed: 0,News,Summary
11,Indonesians face fuel price rise\n\nIndonesia'...,Indonesia's government has confirmed it is con...
716,Jungle TV show ratings drop by 4m\n\nThe final...,"Pasquale follows Kerry McFadden, Phil Tufnell ..."
1222,Goldsmith denies war advice claim\n\nThe attor...,"Former minister Clare Short, who resigned from..."
292,Glazer makes new Man Utd approach\n\nMalcolm G...,Malcolm Glazer has made a fresh approach to bu...
726,Baywatch dubbed 'worst TV import'\n\nSurf show...,"The Jerry Springer Show, which came in at sixt..."
1549,Microsoft takes on desktop search\n\nMicrosoft...,Search giant Google launched its desktop searc...
821,Smith loses US box office crown\n\nNew comedy ...,After topping the chart for two consecutive we...
1086,Short attacks US over tsunami aid\n\nFormer Ca...,But Ms Short said the effect of the parallel c...
875,Bollywood DVD fraudster is jailed\n\nA major d...,Buhecha was previously a legitimate distributo...
585,Fockers fuel festive film chart\n\nComedy Meet...,"It took $19.1m (£9.9m) on Christmas Day alone,..."


In [40]:
data.shape

(1714, 2)

In [44]:
data = data.dropna(axis=1)

In [46]:
data.shape

(1714, 2)

In [48]:
data = data.dropna(how='all')  # Removes rows where all values are NaN
data = data[~data.applymap(lambda x: str(x).strip() == '').all(axis=1)]  # Removes rows with blank values

# Clean individual column entries by stripping leading/trailing whitespaces
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [50]:
data.shape

(1714, 2)

In [58]:
data.sample(20)

Unnamed: 0,News,Summary
1649,Can Yahoo dominate next decade? Yahoo has reac...,Both began life as search engines although in ...
1550,Gamers snap up new Sony PSP Gamers have bought...,Nintendo's goal is to ship 5 million of its ne...
270,Arsenal 'may seek full share listing' Arsenal ...,Speaking at the Soccerex football business for...
300,Libya takes $1bn in unfrozen funds Libya has w...,Libya has withdrawn $1bn in assets from the US...
1109,Custody death rate 'shocks' MPs Deaths in cust...,"""Yet throughout our inquiry we have seen time ..."
1007,Blair 'damaged' by Blunkett row A majority of ...,A total of 53% of those polled said they had s...
1123,Labour pig poster 'anti-Semitic' The Labour Pa...,"Labour said the poster was ""not anti-Jewish, b..."
359,Record year for Chilean copper Chile's copper ...,Chile's copper industry has registered record ...
1215,Hague's six-figure earnings shown The rewards ...,Mr Hague was also paid an undisclosed amount f...
1593,Cyber crime booms in 2004 The last 12 months h...,"Part of the reason that these ""bot nets"" have ..."


In [54]:
data = data.applymap(lambda x: x.replace('\n\n', ' ') if isinstance(x, str) else x)

In [60]:
data.to_csv('cleaned_data.csv', index=False)

In [62]:
average_length_column1 = data['News'].apply(lambda x: len(str(x))).mean()
average_length_column2 = data['Summary'].apply(lambda x: len(str(x))).mean()

print(f"Average length of Column1: {average_length_column1}")
print(f"Average length of Column2: {average_length_column2}")

Average length of Column1: 2368.995332555426
Average length of Column2: 1043.7432905484247


In [64]:
#news 2369 summary 1044