# File Cleaner and Organizer
This is the notebook for the File Cleaner and Organizer app. The main app lives in the .py file, but the notebook makes it easier to understand and to try out new changes. Feel free to download it and tweak the parameters and functions.
The main objective is to clean up folders, like my 'Downloads' folder, which was a mess. While making the app, it occurred to me that clustering could be really helpful here, since it can group similar files and make the organizing more effective. Let's see how it works in this implementation.

## Imports

In [106]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

## Information extraction

In [76]:
# extract info about the files
def extract_file_info(folder_path):
    """Collect basic metadata for every regular file directly inside a folder.

    Sub-directories are skipped and are not descended into.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list with one ``[file_name, file_type, file_size, creation_date]``
        entry per file: the name, the extension without its leading dot, the
        size in kilobytes (bytes / 1024) and the ``os.path.getctime``
        timestamp formatted as ``YYYY-MM-DD``.
    """
    rows = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        if not os.path.isfile(full_path):
            continue
        _, extension = os.path.splitext(full_path)
        size_kb = os.path.getsize(full_path) / 1024
        stamp = datetime.fromtimestamp(os.path.getctime(full_path))
        rows.append([entry, extension[1:], size_kb, stamp.strftime('%Y-%m-%d')])
    return rows

In [77]:
# Demo: list the metadata of every file in a sample folder. The value of the
# last expression is what the notebook shows as the cell output.
masters_path = r'C:\Users\rodri\OneDrive\Documents\Spark'
extract_file_info(masters_path)

[['actor.csv', 'csv', 4.33203125, '2024-05-30'],
 ['film.csv', 'csv', 179.3955078125, '2024-05-30'],
 ['film_actor.csv', 'csv', 44.63671875, '2024-05-30'],
 ['test.ipynb', 'ipynb', 0.8046875, '2024-05-31'],
 ['test_spark_session.py', 'py', 0.248046875, '2024-05-31']]

## File Organizer by Type (Straightforward way, no ML)

In [64]:
def organize_files_by_type(folder_path, info_list):
    """Move each listed file into a sub-folder named after its extension.

    Files that no longer exist, that have no extension, or whose name is
    already taken inside the destination folder are left where they are, so
    an existing file is never replaced.

    Args:
        folder_path: Directory that contains the files to organize.
        info_list: Rows shaped like the output of ``extract_file_info``; only
            the first two items of each row (file name, file type) are used.
    """
    for file_info in info_list:
        file_name, file_type = file_info[0], file_info[1]
        file_path = os.path.join(folder_path, file_name)
        print(f"Processing file: {file_path}")  # Debug print
        if not os.path.isfile(file_path):
            continue
        if not file_type:
            # An empty type would make the "type folder" the base folder
            # itself, i.e. the file would be moved onto its own path.
            print(f"Skipped {file_path}: no file extension")
            continue

        type_folder_path = os.path.join(folder_path, file_type)
        target_path = os.path.join(type_folder_path, file_name)
        if os.path.exists(target_path):
            # shutil.move may overwrite an existing destination file.
            print(f"Skipped {file_path}: {target_path} already exists")
            continue

        if not os.path.exists(type_folder_path):
            os.makedirs(type_folder_path)
            print(f"Created directory: {type_folder_path}")  # Debug print
        shutil.move(file_path, target_path)
        print(f"Moved {file_path} to {target_path}")

In [65]:
# Demo: organize the MASTERS folder by file extension.
# NOTE: organize_files_by_type moves the files on disk.
masters_path = r'C:\Users\rodri\OneDrive\Documents\MASTERS'
masters_info = extract_file_info(masters_path)
organize_files_by_type(masters_path, masters_info)

Processing file: C:\Users\rodri\OneDrive\Documents\MASTERS\scholaro-gpa UOL.pdf
Moved C:\Users\rodri\OneDrive\Documents\MASTERS\scholaro-gpa UOL.pdf to C:\Users\rodri\OneDrive\Documents\MASTERS\pdf\scholaro-gpa UOL.pdf
Processing file: C:\Users\rodri\OneDrive\Documents\MASTERS\Ugarte Sanguineti_certificate.pdf
Moved C:\Users\rodri\OneDrive\Documents\MASTERS\Ugarte Sanguineti_certificate.pdf to C:\Users\rodri\OneDrive\Documents\MASTERS\pdf\Ugarte Sanguineti_certificate.pdf
Processing file: C:\Users\rodri\OneDrive\Documents\MASTERS\UOL transcript.pdf
Moved C:\Users\rodri\OneDrive\Documents\MASTERS\UOL transcript.pdf to C:\Users\rodri\OneDrive\Documents\MASTERS\pdf\UOL transcript.pdf


## Clustering implementation

In [79]:
# Collect the metadata of the Downloads folder; this is the input of the
# clustering steps. NOTE: the variable keeps the name masters_path even though
# it now points at the Downloads folder.
masters_path = r'C:\Users\rodri\Downloads'
file_info = extract_file_info(masters_path)
file_info

[['011cbd17-26c8-41bc-98d4-22aab6f0c8b4.pdf',
  'pdf',
  33.259765625,
  '2024-03-01'],
 ['107065230_10222958673178213_1447619766009901457_n.jpg',
  'jpg',
  382.1611328125,
  '2024-04-01'],
 ['292592821_5527017737320938_5845818422473006150_n.jpg',
  'jpg',
  48.2568359375,
  '2024-05-27'],
 ['about.jpg', 'jpg', 53.3193359375, '2023-08-31'],
 ['Alejandra Siu 17.doc', 'doc', 776.5, '2023-08-22'],
 ['Anaconda3-2024.02-1-Windows-x86_64.exe',
  'exe',
  926074.5078125,
  '2024-05-29'],
 ['Analista de Planeamiento Financiero y Control de Gestión_v3.docx',
  'docx',
  18.8115234375,
  '2023-11-12'],
 ['Application_Form-2024_Call1.xlsx', 'xlsx', 279.1279296875, '2024-04-22'],
 ['Appointment Booked - ESHC My Student Health Record.pdf',
  'pdf',
  56.55859375,
  '2024-04-16'],
 ['app_print.pdf', 'pdf', 52.642578125, '2024-04-14'],
 ['beers-tables.sql', 'sql', 2.806640625, '2024-02-14'],
 ['BFRO (1).zip', 'zip', 9173.5185546875, '2024-04-26'],
 ['BFRO.zip', 'zip', 9173.5185546875, '2024-04-25']

In [96]:
def prepare_features(file_info):
    """Turn file metadata rows into one numeric feature table for clustering.

    Args:
        file_info: Rows of ``[file name, file type, file size (KB),
            creation date]``, as returned by ``extract_file_info``.

    Returns:
        A tuple ``(combined_features, file_names, vectorizer)``: the feature
        DataFrame (TF-IDF columns of the file names, one-hot file types,
        standardized size and standardized creation date, all column labels
        converted to strings), the ``'file name'`` column, and the fitted
        ``TfidfVectorizer``.
    """
    frame = pd.DataFrame(
        file_info,
        columns=['file name', 'file type', 'file size (KB)', 'creation date'],
    )

    # TF-IDF weights of the tokens found in the file names.
    vectorizer = TfidfVectorizer()
    name_matrix = vectorizer.fit_transform(frame['file name'])

    # One indicator column per distinct file type.
    type_dummies = pd.get_dummies(frame['file type'])

    # The same scaler object is re-fitted for each numeric column in turn.
    scaler = StandardScaler()
    scaled_size = scaler.fit_transform(frame[['file size (KB)']])
    creation_column = pd.to_datetime(frame['creation date']).values.reshape(-1, 1)
    scaled_date = scaler.fit_transform(creation_column)

    # Glue all feature groups together side by side.
    parts = [
        pd.DataFrame(name_matrix.toarray()),
        type_dummies.reset_index(drop=True),
        pd.DataFrame(scaled_size, columns=['file size']),
        pd.DataFrame(scaled_date, columns=['creation date']),
    ]
    combined_features = pd.concat(parts, axis=1)
    combined_features.columns = combined_features.columns.astype(str)

    return combined_features, frame['file name'], vectorizer

In [97]:
# Build the feature table for the collected files and preview its first rows.
features, file_names, vectorizer = prepare_features(file_info)
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,py,sql,tmp,tsv,txt,xlsx,xml,zip,file size,creation date
0,0.0,0.44184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,-0.287094,0.133246
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,-0.285301,0.336535
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,-0.287017,0.703766
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,-0.286991,-1.066814
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,-0.283273,-1.125834


In [104]:
def KMeans_clustering(features, random_state=None):
    """Cluster the feature rows with KMeans, choosing k by silhouette score.

    Candidate values of k run from 5 to ``num_files // 4``. With fewer than 20
    rows that range is empty, so k is searched from 2 up to
    ``min(5, num_files - 1)`` instead. With fewer than 3 rows no silhouette
    score can be computed and every row is put in cluster 0.

    Args:
        features: 2-D feature table with one row per file, e.g. the first
            value returned by ``prepare_features``.
        random_state: Seed passed to KMeans. When ``None`` a random integer in
            ``[0, 100)`` is drawn once and used for every candidate k.

    Returns:
        An integer array with one cluster label per row of ``features``.
    """
    num_files = features.shape[0]
    if num_files < 3:
        # silhouette_score needs 2 <= number of labels <= n_samples - 1.
        return np.zeros(num_files, dtype=np.int32)

    rand = np.random.randint(100) if random_state is None else random_state

    lower = 5
    upper = min(num_files // 4, num_files - 1)
    if upper < lower:
        # Small folder: the usual range is empty, fall back to small k values.
        lower, upper = 2, min(5, num_files - 1)

    best_score = -1
    best_k = -1
    best_labels = None
    for k in range(lower, upper + 1):
        cluster_labels = KMeans(n_clusters=k, random_state=rand).fit_predict(features)
        distinct = len(np.unique(cluster_labels))
        if distinct < 2 or distinct > num_files - 1:
            # Degenerate result (e.g. many identical rows); cannot be scored.
            continue
        score = silhouette_score(features, cluster_labels)
        if best_labels is None or score > best_score:
            best_score = score
            best_k = k
            # Keep the labels: refitting with the same k and seed would only
            # reproduce them.
            best_labels = cluster_labels

    if best_labels is None:
        return np.zeros(num_files, dtype=np.int32)

    print(f'The best k is {best_k} with a silhouette score of {best_score}')
    return best_labels

In [105]:
# Cluster the files and show the label assigned to each one.
labels = KMeans_clustering(features)
labels

The best k is 18 with a silhouette score of 0.20325982000798307


array([ 9,  0,  0,  0,  8,  4, 15, 10,  9,  9, 16,  2,  2, 16,  5,  9,  5,
        5,  5,  5,  9, 10, 10,  9,  9,  3, 13,  9,  5,  2,  9,  5,  7,  7,
        0, 16,  9,  9,  9,  4, 12, 15,  9, 16,  9, 11, 11,  9,  9, 14, 17,
       14,  8, 15,  6, 17,  8, 14,  0,  0,  0,  7,  2,  9, 16,  5, 16,  5,
       15,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  6,  6,  6,  6,  0,  7, 12,  9,  9, 17,
        9, 16,  4, 15,  9,  7, 16, 16,  7,  2,  3,  3,  3,  3])

This is the most important function: it derives the name of each organized folder from the names of the files within it. Honestly, I was not expecting the names to make sense, but in this example they turned out better than expected.

In [114]:
def generate_folder_names(file_names, labels, vectorizer, n_words):
    """Build a folder name for every cluster from its highest-scoring terms.

    For each cluster the file names are joined into one string, scored with
    ``vectorizer``, and the ``n_words`` best terms are joined with spaces.
    Only terms that actually occur (score above zero) are used; a cluster
    with no scoring term at all is named ``cluster_<label>``.

    Args:
        file_names: File names, aligned position-by-position with ``labels``.
        labels: Cluster label of each file.
        vectorizer: Fitted vectorizer offering ``transform`` and
            ``get_feature_names_out`` (the one returned by
            ``prepare_features``).
        n_words: Maximum number of terms per folder name.

    Returns:
        Dict mapping each cluster label to its folder name.
    """
    # Positional arrays so that lists and pandas Series behave the same.
    names_arr = np.asarray(file_names)
    labels_arr = np.asarray(labels)
    # The vocabulary is the same for every cluster, so look it up once.
    feature_array = np.array(vectorizer.get_feature_names_out())

    folder_names = {}
    for label in np.unique(labels_arr):
        # combine all the names in a single string for analysis
        combined_names = " ".join(names_arr[labels_arr == label])

        # score every vocabulary term for this cluster, best first
        scores = vectorizer.transform([combined_names]).toarray().ravel()
        ranking = np.argsort(scores)[::-1]
        # drop terms that do not occur in this cluster before truncating
        ranking = ranking[scores[ranking] > 0][:n_words]
        top_terms = feature_array[ranking]

        if len(top_terms):
            folder_names[label] = " ".join(top_terms)
        else:
            folder_names[label] = f"cluster_{label}"
    return folder_names

In [117]:
# Name each cluster with its top terms and show the label -> name mapping.
folder_names = generate_folder_names(file_names, labels, vectorizer, 2) # 2 words per name turned out to be concise and useful for my case
folder_names

{0: 'jpg aafd',
 1: 'volumen sbr',
 2: 'zip bfro',
 3: 'tmp gre1806c460',
 4: '001 zip',
 5: 'pdf benjamin',
 6: 'png screenshot',
 7: 'exe dsci551',
 8: 'pptx doc',
 9: 'pdf generalwaiver',
 10: 'xlsx cuestionario',
 11: 'lab txt',
 12: 'py helper',
 13: 'docker desktop',
 14: 'msi 36',
 15: 'docx de',
 16: 'at jpeg',
 17: 'pgn java'}

In [123]:
def organize_files_clustering(base_path, file_names, labels, folder_names):
    """Move every file into the folder named for its cluster.

    Files that are no longer present in ``base_path``, or whose name is
    already taken inside the destination folder, are reported and left where
    they are, so one stale entry neither aborts the run nor replaces an
    existing file.

    Args:
        base_path: Directory holding the files; the cluster folders are
            created inside it.
        file_names: File names, aligned position-by-position with ``labels``.
        labels: Cluster label of each file.
        folder_names: Mapping from cluster label to folder name.
    """
    # Create every cluster folder up front.
    for folder_name in set(folder_names.values()):
        folder_path = os.path.join(base_path, folder_name)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

    for file_name, label in zip(file_names, labels):
        folder_name = folder_names[label]
        source_path = os.path.join(base_path, file_name)
        destination_path = os.path.join(base_path, folder_name, file_name)
        if not os.path.isfile(source_path):
            print(f"Skipped {source_path}: file not found")
            continue
        if os.path.exists(destination_path):
            # shutil.move may overwrite an existing destination file.
            print(f"Skipped {source_path}: {destination_path} already exists")
            continue
        shutil.move(source_path, destination_path)

In [124]:
# Move the files into their cluster folders. masters_path is the Downloads
# path assigned in the clustering section. NOTE: this moves files on disk.
organize_files_clustering(masters_path, file_names, labels, folder_names)

For testing purposes, this function reverts the organization by moving the files back out of their folders, so we can try other parameters and see how they affect the result.

In [None]:
def reverse_organization(base_path):
    """Flatten ``base_path`` by moving sub-folder contents back up one level.

    Every immediate sub-directory of ``base_path`` is processed, not only the
    ones created by the organizer functions. An entry whose name is already
    taken in ``base_path`` is reported and left inside its folder instead of
    replacing the existing item; a folder is removed only once it is empty.

    Args:
        base_path: Directory whose sub-folders should be emptied and removed.
    """
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            source_path = os.path.join(folder_path, file_name)
            destination_path = os.path.join(base_path, file_name)
            if os.path.exists(destination_path):
                # shutil.move may overwrite an existing destination file.
                print(f"Skipped {source_path}: {destination_path} already exists")
                continue
            shutil.move(source_path, destination_path)

        # os.rmdir only works on empty directories; keep folders that still
        # hold skipped entries.
        if not os.listdir(folder_path):
            os.rmdir(folder_path)