In [8]:
import re
from collections import Counter
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Download the NLTK data packages this notebook asks for, one after another.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)

def _mean_word_length(value):
    """Return the mean length of the whitespace-separated words in *value*.

    Non-string values (e.g. NaN for a missing entry) and strings without any
    word yield NaN instead of being stringified or averaged over nothing.
    """
    if not isinstance(value, str):
        return np.nan
    words = value.split()
    if not words:
        return np.nan
    return sum(map(len, words)) / len(words)


def basic_text_analysis(df, column_name='Complaint Description'):
    """
    Compute per-row size metrics for a text column plus summary statistics.

    Args:
        df: DataFrame holding the text; it is copied, never modified.
        column_name: Name of the string column to analyse.

    Returns:
        A ``(analysis_df, summary_stats)`` tuple. ``analysis_df`` is a copy
        of ``df`` with the added columns ``word_count``, ``char_count``,
        ``avg_word_length`` and ``sentence_count``. ``summary_stats`` is a
        dict with the keys ``avg_word_count``, ``max_word_count``,
        ``min_word_count``, ``avg_char_count`` and ``avg_sentence_count``.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    analysis_df = df.copy()
    text = analysis_df[column_name]

    # Per-row metrics
    analysis_df['word_count'] = text.str.split().str.len()
    analysis_df['char_count'] = text.str.len()
    # Missing or empty text gives NaN here rather than the length of the
    # literal word "nan" or a numpy empty-mean warning.
    analysis_df['avg_word_length'] = text.apply(_mean_word_length)
    # Counts runs of '.', '!' or '?', so text without any of these
    # characters is reported as 0.
    analysis_df['sentence_count'] = text.str.count('[.!?]+')

    # Column-level summary statistics
    summary_stats = {
        'avg_word_count': analysis_df['word_count'].mean(),
        'max_word_count': analysis_df['word_count'].max(),
        'min_word_count': analysis_df['word_count'].min(),
        'avg_char_count': analysis_df['char_count'].mean(),
        'avg_sentence_count': analysis_df['sentence_count'].mean(),
    }

    return analysis_df, summary_stats

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance that every call reuses."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """
    Turn raw text into a list of lower-cased, lemmatized tokens.

    Steps: lower-case, drop every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, remove English stop words,
    then lemmatize what is left.

    Args:
        text: The text to clean. Non-string values are converted with
            ``str``; missing values (``None``, NaN, ``pd.NA``) are treated
            as empty text.

    Returns:
        A list of token strings; empty for missing input.
    """
    # A missing value would otherwise be stringified to "nan"/"none" and
    # show up as a bogus token in the word frequencies.
    if (
        not isinstance(text, str)
        and pd.api.types.is_scalar(text)
        and pd.isna(text)
    ):
        return []

    # Convert to lowercase and remove special characters
    cleaned = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())

    # The stop-word set and the lemmatizer are cached so that calling this
    # function once per row does not rebuild them every time.
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

def advanced_text_analysis(df, column_name='Complaint Description'):
    """
    Score sentiment per row and collect word and bigram frequencies.

    Args:
        df: DataFrame holding the text; it is copied, never modified.
        column_name: Name of the text column to analyse.

    Returns:
        A ``(advanced_df, analysis_results)`` tuple. ``advanced_df`` is a
        copy of ``df`` with ``sentiment`` (TextBlob polarity) and
        ``subjectivity`` (TextBlob subjectivity) columns added.
        ``analysis_results`` is a dict with:

        * ``top_words``: the 20 most frequent preprocessed tokens -> count
        * ``top_bigrams``: the 10 most frequent token pairs -> count
        * ``avg_sentiment`` / ``avg_subjectivity``: column means
        * ``sentiment_distribution``: ``value_counts(bins=5)`` of sentiment
    """
    # Create a copy for analysis
    advanced_df = df.copy()
    texts = advanced_df[column_name]

    # Build one TextBlob per row and read both scores from it instead of
    # running the sentiment analysis twice.
    scores = [TextBlob(str(text)).sentiment for text in texts]
    advanced_df['sentiment'] = np.array(
        [score.polarity for score in scores], dtype=float
    )
    advanced_df['subjectivity'] = np.array(
        [score.subjectivity for score in scores], dtype=float
    )

    word_freq = Counter()
    bigram_freq = Counter()
    for text in texts:
        tokens = preprocess_text(text)
        word_freq.update(tokens)
        # Pair tokens inside one row only; pairing over the concatenated
        # stream would count the last word of one row together with the
        # first word of the next as a phrase.
        bigram_freq.update(nltk.bigrams(tokens))

    # Most frequent words and two-word phrases
    top_words = dict(word_freq.most_common(20))
    top_bigrams = dict(bigram_freq.most_common(10))

    analysis_results = {
        'top_words': top_words,
        'top_bigrams': top_bigrams,
        'avg_sentiment': advanced_df['sentiment'].mean(),
        'avg_subjectivity': advanced_df['subjectivity'].mean(),
        'sentiment_distribution': advanced_df['sentiment'].value_counts(bins=5),
    }

    return advanced_df, analysis_results

def generate_text_report(df, column_name='Complaint Description'):
    """
    Print a text analysis report and return the combined per-row metrics.

    Runs ``basic_text_analysis`` and ``advanced_text_analysis`` on ``df``,
    prints the basic summary statistics, the ten most common words and the
    average sentiment and subjectivity.

    Args:
        df: DataFrame holding the text; it is not modified.
        column_name: Name of the text column to analyse.

    Returns:
        The frame produced by ``basic_text_analysis`` with the
        ``sentiment`` and ``subjectivity`` columns appended.
    """
    # Run both analyses
    basic_df, basic_stats = basic_text_analysis(df, column_name)
    advanced_df, advanced_stats = advanced_text_analysis(df, column_name)

    # Print report
    print("=== Basic Text Analysis ===")
    for key, value in basic_stats.items():
        print(f"{key}: {value:.2f}")

    print("\n=== Advanced Text Analysis ===")
    print("\nTop 10 Most Common Words:")
    for word, count in list(advanced_stats['top_words'].items())[:10]:
        print(f"{word}: {count}")

    print(f"\nAverage Sentiment: {advanced_stats['avg_sentiment']:.2f}")
    print(f"Average Subjectivity: {advanced_stats['avg_subjectivity']:.2f}")

    # Both frames are copies of df with the same row order, so the score
    # columns are copied across by position. An index-based join would
    # multiply rows whenever df's index contains repeated labels.
    for column in ('sentiment', 'subjectivity'):
        basic_df[column] = advanced_df[column].to_numpy()

    return basic_df

[nltk_data] Downloading package punkt to /home/chandu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/chandu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/chandu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/chandu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chandu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Load the dataset that the remaining cells operate on.
df = pd.read_csv("analysis.csv")
# Uncomment to print the text report for df (the helper reads the
# 'Complaint Description' column by default).
#generate_text_report(df)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Normalise case. Missing descriptions become empty strings so that neither
# the lower-casing nor the tokenizer below fails on NaN.
# NOTE(review): this overwrites the raw text in place; the entity-extraction
# cell reads the same column and may work better on the original casing.
df['Complaint Description'] = (
    df['Complaint Description'].fillna('').astype(str).str.lower()
)
df['Tokenized'] = df['Complaint Description'].apply(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Stopwords Removed'] = df['Tokenized'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Lemmatize words
lemmatizer = WordNetLemmatizer()
df['Lemmatized'] = df['Stopwords Removed'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

# Join the tokens back into space-separated strings so that text
# vectorizers applied to 'Cleaned Text' see one token per word.
df['Cleaned Text'] = df['Lemmatized'].apply(lambda tokens: ' '.join(tokens))
		

In [None]:
# Load the spaCy pipeline used for named-entity recognition.
nlp = spacy.load('en_core_web_sm')

# nlp.pipe streams the texts through the pipeline in batches, which avoids
# one separate pipeline call per row. Each row receives a list of
# (entity text, entity label) tuples, in the same row order as df.
df['Entities'] = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp.pipe(df['Complaint Description'])
]

In [None]:
# Build a gensim dictionary from the lemmatized token lists
dictionary = corpora.Dictionary(df['Lemmatized'])

# Convert each document's token list into a bag-of-words vector using the dictionary
corpus = [dictionary.doc2bow(text) for text in df['Lemmatized']]

# Fit a TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus)

# Train a 5-topic LDA model (15 passes) on the TF-IDF-transformed corpus
# rather than on the raw bag-of-words counts.
# NOTE(review): no random seed is passed here, so the topics may differ
# between runs — confirm whether reproducibility matters.
lda_model = models.LdaModel(corpus=tfidf[corpus], id2word=dictionary, passes=15, num_topics=5)
# Print every topic, limited to 4 words per topic
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
# Split the cleaned text and the 'Label' column into training (80%) and
# testing (20%) sets.
# NOTE(review): no random_state is passed, so the split (and the accuracy
# printed below) presumably changes from run to run — confirm if intended.
train_text, test_text, train_labels, test_labels = train_test_split(df['Cleaned Text'], df['Label'], test_size=0.2)

# Vectorize the text data: the vocabulary and IDF weights are fitted on the
# training text only, then reused unchanged to transform the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)
y_train = train_labels
X_test = vectorizer.transform(test_text)

# Train a Naive Bayes classifier on the TF-IDF features
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the predictions against the held-out labels
print("Accuracy:", accuracy_score(test_labels, y_pred))