In [None]:
!pip install textstat
!pip install imbalanced-learn==0.10.1
!pip install --upgrade scikit-learn



In [None]:
!!pip install --upgrade imbalanced-learn


# !/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Financial Fraud Text Analysis - Exploratory Data Analysis
This script performs exploratory data analysis on financial fraud text data,
analyzing text features, sentiment, readability, and addressing class imbalance.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import textstat
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

# Download necessary NLTK resources
# Fix SSL certificate issues for NLTK downloads
import ssl
# NOTE(review): this replaces the process-wide default HTTPS context factory
# with the unverified one, i.e. TLS certificates are no longer checked for
# connections that rely on that default (presumably including the NLTK
# downloads below). Prefer fixing the local CA bundle where possible.
try:
    # Private stdlib helper; the AttributeError branch covers builds lacking it.
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Helper unavailable: leave the interpreter's default context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Make sure the per-user NLTK data folder is present
nltk_data_dir = os.path.expanduser('~/nltk_data')
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

# Fetch, in order, every tokenizer/corpus resource the analysis relies on
print("Downloading NLTK resources...")
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=False)

# Alternative approach for tokenization if punkt_tab is not available
def safe_tokenize(text, tokenize_func):
    """Run ``tokenize_func`` on ``text``, degrading gracefully on LookupError.

    When the tokenizer raises ``LookupError``, ``word_tokenize`` falls back
    to whitespace splitting and ``sent_tokenize`` falls back to splitting on
    whitespace that follows ``.``, ``!`` or ``?`` (blank pieces dropped).
    Any other tokenizer yields an empty list in that situation.
    """
    try:
        tokens = tokenize_func(text)
    except LookupError:
        if tokenize_func == word_tokenize:
            # Crude word split on whitespace
            tokens = text.split()
        elif tokenize_func == sent_tokenize:
            # Crude sentence split right after terminal punctuation
            pieces = re.split(r'(?<=[.!?])\s+', text)
            tokens = [piece for piece in pieces if piece.strip()]
        else:
            tokens = []
    return tokens

# Load the dataset
print("Loading dataset...")
data_path = '/kaggle/input/financial-statement-fraud-data/Final_Dataset.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")

# Display basic information
print("\nBasic Information:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nClass distribution:")
# Count the labels once and reuse the result for both outputs
class_counts = df['Fraud'].value_counts()
print(class_counts)
print(f"Class balance ratio: {class_counts.min() / class_counts.max():.2f}")

# Create a copy of the dataframe for analysis
df_analysis = df.copy()

# Convert target to binary ('yes' -> 1, 'no' -> 0); any other label maps to NaN
df_analysis['Fraud_Binary'] = df_analysis['Fraud'].map({'yes': 1, 'no': 0})
_unmapped_labels = int(df_analysis['Fraud_Binary'].isna().sum())
if _unmapped_labels:
    # Surface silently-unmapped labels instead of letting NaN targets flow on
    print(f"Warning: {_unmapped_labels} rows have a 'Fraud' label other than 'yes'/'no'.")

# Text Preprocessing Function
def preprocess_text(text):
    """Normalize raw text for token-level analysis.

    Lowercases ``text``, replaces every ASCII punctuation character with a
    space, collapses whitespace runs to a single space and strips the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation. The punctuation set must be escaped before being
    # placed in a character class: unescaped, its "\]" sequence is read as an
    # escaped bracket and backslashes are never matched.
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing
df_analysis['Processed_Text'] = df_analysis['Fillings'].apply(preprocess_text)

# Text Length Analysis
print("\nAnalyzing text length distributions...")
# Character count of the raw filing text
df_analysis['Text_Length'] = df_analysis['Fillings'].apply(len)

# Tokenize each processed document once and derive both word-level features
# from the same token lists (previously every document was tokenized three times).
_word_tokens = df_analysis['Processed_Text'].apply(lambda x: safe_tokenize(x, word_tokenize))
df_analysis['Word_Count'] = _word_tokens.apply(len)
# Sentences are counted on the raw text because preprocessing strips punctuation
df_analysis['Sentence_Count'] = df_analysis['Fillings'].apply(lambda x: len(safe_tokenize(x, sent_tokenize)))
# Mean token length; 0 for documents without tokens
df_analysis['Avg_Word_Length'] = _word_tokens.apply(
    lambda tokens: np.mean([len(word) for word in tokens]) if tokens else 0
)
# A zero sentence count is replaced by 1 to avoid dividing by zero
df_analysis['Avg_Sentence_Length'] = df_analysis['Word_Count'] / df_analysis['Sentence_Count'].replace(0, 1)

# Readability Analysis
print("Calculating readability metrics...")
# Each metric column is a textstat scorer applied to the raw filing text
for _metric_column, _metric_func in (
    ('Flesch_Reading_Ease', textstat.flesch_reading_ease),
    ('Flesch_Kincaid_Grade', textstat.flesch_kincaid_grade),
    ('Automated_Readability_Index', textstat.automated_readability_index),
):
    df_analysis[_metric_column] = df_analysis['Fillings'].apply(_metric_func)

# Sentiment Analysis
print("Performing sentiment analysis...")
# Build one TextBlob per document and read both scores from the same
# sentiment result, instead of analysing every text twice.
_sentiments = [TextBlob(text).sentiment for text in df_analysis['Fillings']]
df_analysis['Polarity'] = [sentiment.polarity for sentiment in _sentiments]
df_analysis['Subjectivity'] = [sentiment.subjectivity for sentiment in _sentiments]

# Summary statistics by class
print("\nSummary statistics by fraud class:")
# mean / median / std of each listed feature per 'Fraud' label; the transpose
# puts the (feature, statistic) pairs on the rows and the labels on the columns
fraud_stats = (
    df_analysis
    .groupby('Fraud')[[
        'Text_Length', 'Word_Count', 'Sentence_Count', 'Avg_Word_Length',
        'Avg_Sentence_Length', 'Flesch_Reading_Ease', 'Flesch_Kincaid_Grade',
        'Polarity', 'Subjectivity',
    ]]
    .agg(['mean', 'median', 'std'])
    .T
)
print(fraud_stats)

# Create a directory for plots if it doesn't exist.
# exist_ok=True does this in one call, without a separate existence check
# that could race with another process creating the folder.
os.makedirs('plots', exist_ok=True)

# Visualization functions
def plot_distribution(df, column, title, xlabel, ylabel="Frequency", bins=30, figsize=(10, 6)):
    """Save overlaid fraud / non-fraud histograms of ``column``.

    Rows are selected by the ``Fraud`` column ('yes' / 'no'). The figure is
    written to ``plots/<column>_distribution.png`` and then closed.
    """
    plt.figure(figsize=figsize)

    # Fraud is drawn first, non-fraud second, both half-transparent
    for fraud_value, legend_label, bar_color in (
        ('yes', 'Fraud', 'red'),
        ('no', 'Non-Fraud', 'blue'),
    ):
        class_rows = df[df['Fraud'] == fraud_value]
        sns.histplot(data=class_rows, x=column, bins=bins, alpha=0.5,
                     label=legend_label, color=bar_color)

    # Titles and axis labels
    plt.title(title, fontsize=15)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend()

    # Persist and release the figure
    plt.tight_layout()
    plt.savefig(f'plots/{column}_distribution.png')
    plt.close()

def plot_boxplot(df, column, title, ylabel, figsize=(10, 6)):
    """Save a boxplot of ``column`` grouped by the ``Fraud`` column.

    The figure is written to ``plots/<column>_boxplot.png`` and then closed.
    """
    output_file = f'plots/{column}_boxplot.png'

    plt.figure(figsize=figsize)
    # One box per value of the 'Fraud' column
    sns.boxplot(data=df, x='Fraud', y=column)

    plt.title(title, fontsize=15)
    plt.xlabel('Fraud Status', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()

# Generate visualizations
print("\nGenerating visualizations...")

# (column, boxplot title, axis label) for every feature that gets a histogram
# followed by a boxplot: text length, word count, sentence count,
# readability and the two sentiment scores. The histogram title is the
# boxplot title prefixed with "Distribution of ".
_plot_specs = (
    ('Text_Length', 'Text Length by Fraud Status', 'Text Length (characters)'),
    ('Word_Count', 'Word Count by Fraud Status', 'Word Count'),
    ('Sentence_Count', 'Sentence Count by Fraud Status', 'Sentence Count'),
    ('Flesch_Reading_Ease', 'Flesch Reading Ease by Fraud Status', 'Flesch Reading Ease'),
    ('Polarity', 'Text Polarity by Fraud Status', 'Polarity'),
    ('Subjectivity', 'Text Subjectivity by Fraud Status', 'Subjectivity'),
)
for _plot_column, _boxplot_title, _axis_label in _plot_specs:
    plot_distribution(df_analysis, _plot_column, f'Distribution of {_boxplot_title}', _axis_label)
    plot_boxplot(df_analysis, _plot_column, _boxplot_title, _axis_label)

# Correlation heatmap
print("Generating correlation heatmap...")
plt.figure(figsize=(12, 10))
# Engineered numeric features plus the binary target, so the matrix also
# shows how each feature correlates with Fraud_Binary.
correlation_cols = ['Text_Length', 'Word_Count', 'Sentence_Count', 'Avg_Word_Length',
                    'Avg_Sentence_Length', 'Flesch_Reading_Ease', 'Flesch_Kincaid_Grade',
                    'Automated_Readability_Index', 'Polarity', 'Subjectivity', 'Fraud_Binary']
# Pairwise correlations using DataFrame.corr's default method
corr_matrix = df_analysis[correlation_cols].corr()
# Boolean mask covering the upper triangle including the diagonal, so only
# the cells strictly below the diagonal are drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Fixed colour range [-1, 1] with each cell annotated to two decimals
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="coolwarm",
            vmin=-1, vmax=1, square=True, linewidths=.5)
plt.title('Correlation Heatmap of Text Features', fontsize=15)
plt.tight_layout()
plt.savefig('plots/correlation_heatmap.png')
# Close the figure so it does not stay open after being saved
plt.close()

# Word frequency analysis
print("Analyzing word frequencies...")

# Function to get most common words
def get_common_words(text_series, n=20, min_word_length=3):
    """Return the ``n`` most frequent words across ``text_series``.

    Tokens are lowercased before counting. A token is skipped when its
    lowercase form is an English stop word, when it is shorter than
    ``min_word_length`` characters, or when it is not purely alphabetic.
    The result is a list of ``(word, count)`` pairs, most frequent first.
    """
    stop_words = set(stopwords.words('english'))
    frequencies = Counter()

    for document in text_series:
        for token in safe_tokenize(document, word_tokenize):
            lowered = token.lower()
            if lowered in stop_words:
                continue
            if len(token) < min_word_length or not token.isalpha():
                continue
            frequencies[lowered] += 1

    return frequencies.most_common(n)

# Get common words for fraud and non-fraud texts
_fraud_rows = df_analysis['Fraud'] == 'yes'
_non_fraud_rows = df_analysis['Fraud'] == 'no'
fraud_common = get_common_words(df_analysis.loc[_fraud_rows, 'Processed_Text'])
non_fraud_common = get_common_words(df_analysis.loc[_non_fraud_rows, 'Processed_Text'])

# Plot word frequencies
def plot_word_freq(word_counts, title, color, filename):
    """Save a horizontal bar chart of word frequencies.

    Args:
        word_counts: Sequence of ``(word, count)`` pairs, e.g. the result of
            ``Counter.most_common``.
        title: Figure title.
        color: Single colour used for every bar.
        filename: Base name of the output; the chart is written to
            ``plots/<filename>.png``.

    Returns:
        None. When ``word_counts`` is empty a notice is printed and no
        figure is created.
    """
    if not word_counts:
        # zip(*[]) produces nothing to unpack into (words, counts), which
        # used to raise ValueError; skip the chart instead.
        print(f"No words to plot for '{title}'; skipping.")
        return

    words, counts = zip(*word_counts)

    plt.figure(figsize=(12, 8))
    # A uniform bar colour is expressed with `color`; passing a one-element
    # `palette` without `hue` is deprecated in recent seaborn releases.
    sns.barplot(x=list(counts), y=list(words), color=color)
    plt.title(title, fontsize=15)
    plt.xlabel('Frequency', fontsize=12)
    plt.ylabel('Words', fontsize=12)
    plt.tight_layout()
    plt.savefig(f'plots/{filename}.png')
    plt.close()

# One frequency chart per class: fraudulent texts first, then non-fraudulent
for _word_counts, _chart_title, _bar_color, _chart_file in (
    (fraud_common, 'Most Common Words in Fraudulent Texts', 'red', 'fraud_common_words'),
    (non_fraud_common, 'Most Common Words in Non-Fraudulent Texts', 'blue', 'non_fraud_common_words'),
):
    plot_word_freq(_word_counts, _chart_title, _bar_color, _chart_file)

# Generate word clouds
print("Generating word clouds...")

def generate_wordcloud(text_series, title, color_map, filename):
    """Render a word cloud for ``text_series`` and save it as a PNG.

    All texts are joined with single spaces into one corpus, from which a
    cloud of at most 100 words is generated. The image is written to
    ``plots/<filename>.png`` and the figure is closed afterwards.
    """
    corpus = ' '.join(text_series)

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=100,
        colormap=color_map,
        contour_width=1,
        contour_color='steelblue',
    )
    cloud.generate(corpus)

    plt.figure(figsize=(12, 8))
    plt.imshow(cloud, interpolation='bilinear')
    # Axes are hidden: the figure shows only the rendered image
    plt.axis('off')
    plt.title(title, fontsize=15)

    plt.tight_layout()
    plt.savefig(f'plots/{filename}.png')
    plt.close()

# One word cloud per class: fraudulent texts first, then non-fraudulent
for _fraud_label, _cloud_title, _cloud_cmap, _cloud_file in (
    ('yes', 'Word Cloud - Fraudulent Texts', 'Reds', 'fraud_wordcloud'),
    ('no', 'Word Cloud - Non-Fraudulent Texts', 'Blues', 'non_fraud_wordcloud'),
):
    generate_wordcloud(df_analysis[df_analysis['Fraud'] == _fraud_label]['Processed_Text'],
                       _cloud_title, _cloud_cmap, _cloud_file)

# Class Imbalance Handling with SMOTE
print("\nDemonstrating class imbalance handling with SMOTE...")
# Note: Our dataset is already balanced, but we'll demonstrate oversampling for educational purposes

# SMOTE-NC targets data that mixes categorical and continuous columns. Every
# feature used here (TF-IDF weights and the numeric text statistics) is
# continuous, so plain SMOTE is the matching variant. Declaring the TF-IDF
# columns as categorical made SMOTE-NC treat each distinct weight as its own
# category instead of interpolating between neighbours.
from imblearn.over_sampling import SMOTE

# Prepare data for resampling
# We'll use TF-IDF vectorization for the text features
print("Vectorizing text data...")
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df_analysis['Processed_Text'])

# Convert to DataFrame for easier handling. Reuse df_analysis' index so the
# column-wise concat below aligns row by row even if that index is not 0..n-1.
X_tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df_analysis.index,
)

# Add some numerical features
X_combined = pd.concat([
    X_tfidf_df,
    df_analysis[['Text_Length', 'Word_Count', 'Sentence_Count', 'Flesch_Reading_Ease', 'Polarity', 'Subjectivity']]
], axis=1)

# Split data (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, df_analysis['Fraud_Binary'], test_size=0.3, random_state=42
)

# Oversample the training split only; the test split stays untouched
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Original training data shape: {X_train.shape}, Class distribution: {pd.Series(y_train).value_counts().to_dict()}")
print(f"Resampled training data shape: {X_resampled.shape}, Class distribution: {pd.Series(y_resampled).value_counts().to_dict()}")

# Save analysis results to CSV
print("\nSaving analysis results...")
# Tracks whether the CSV was written, so the closing summary does not claim
# a file exists after a failed save.
results_saved = False
try:
    # Create data directory if it doesn't exist
    os.makedirs('data', exist_ok=True)
    df_analysis.to_csv('data/financial_fraud_analysis.csv', index=False)
except Exception as e:
    # Best effort: a failed export is reported but does not abort the run
    print(f"Error saving analysis results: {e}")
else:
    results_saved = True
    print("Analysis results saved successfully.")

print("\nExploratory Data Analysis completed successfully!")
print("Results and visualizations have been saved to the 'plots' directory.")
if results_saved:
    print("Detailed analysis data has been saved to 'data/financial_fraud_analysis.csv'.")


In [3]:
import os, zipfile
from IPython.display import FileLink

def zipdir(path, ziph):
    """Recursively add every file under ``path`` to the open archive ``ziph``.

    Args:
        path: Directory to walk.
        ziph: An open, writable ``zipfile.ZipFile`` handle.

    Archive names are computed relative to ``os.path.dirname(path)``: for a
    path without a trailing separator the top-level folder name is kept in
    the archive, while for a path ending in a separator the entries start
    directly at the folder's contents.

    If the archive file itself is located inside ``path`` it is skipped, so
    the zip never tries to include its own partially written contents.
    """
    base_dir = os.path.dirname(path)

    # ZipFile.filename is only a usable path when the archive was opened from
    # one; for file-object backed archives there is nothing to exclude.
    archive_name = getattr(ziph, 'filename', None)
    archive_path = os.path.realpath(archive_name) if isinstance(archive_name, str) else None

    for root, _dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if archive_path is not None and os.path.realpath(filepath) == archive_path:
                # Do not add the archive currently being written to itself
                continue
            arcname = os.path.relpath(filepath, base_dir)
            ziph.write(filepath, arcname)

# Folder whose contents are packed. The trailing slash matters to zipdir,
# which derives archive names relative to os.path.dirname() of this path.
output_folder = '/kaggle/working/'
# NOTE(review): zip_path is relative, so the archive is created in the current
# working directory; if that is output_folder, the archive sits inside the
# tree being zipped — confirm zipdir copes with that.
zip_path = 'output.zip'

# The context manager finalizes and closes the archive on exit
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    zipdir(output_folder, z)

print(f"📦 Packed everything into {zip_path}")
# Last expression of the cell: the notebook displays it as a link to the archive
FileLink(zip_path)


📦 Packed everything into output.zip
